Library¶

In [ ]:
import pandas as pd
import numpy as np
import PreProcessingText as ppt
from collections import Counter, defaultdict
import seaborn as sns
from wordcloud import WordCloud
import networkx as nx
import matplotlib.pyplot as plt
import squarify
from transformers import pipeline
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired, PartOfSpeech
from sklearn.feature_extraction.text import CountVectorizer
from keybert import KeyBERT
from umap import UMAP
import hdbscan
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import csv
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.cluster import KMeans
from scipy.spatial import distance
from scipy.cluster import hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from matplotlib.colors import ListedColormap
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora

4° Approach: BERTopic¶

Baseline Summary¶

Clustering Approach¶

  • Parameter Setting: A high parameter was set for HDBSCAN to ensure well-defined clusters that occupy a significant percentage of the total dataset. This baseline is intended to feed machine learning algorithms for prediction purposes.

Initial Clustering Results¶

  • Clusters Retrieved: 7 representative clusters were identified:

    1. Drug sales
    2. Bitcoin
    3. Scammers and seller reviews
    4. Marketplace advertising
    5. Purchase reviews
    6. Drug purchases
    7. Orders
  • Outliers: Initially, 34k outliers were found out of a total of 66k records.

  • Performance Metrics:

    • Silhouette Score: 0.64
    • Davies-Bouldin Score: 0.6

Outlier Reduction¶

  • Cosine Measure on Embeddings: By applying a cosine similarity measure with a 0.53 threshold, the number of outliers was reduced from 34k to 27k, reintroducing about 7k records.

  • Updated Performance Metrics:

    • Silhouette Score: 0.51
    • Davies-Bouldin Score: 0.8

Trade-off Analysis¶

  • Outlier Reintroduction: Reintroducing the outliers found a balance that prevented significant cluster degradation while keeping clusters well separated and defined, as evidenced by the graphs.

  • Cluster Distribution: The updated clusters are well-distributed:

    • Maximum cluster size: 23% of the total dataset
    • Minimum cluster size: 7% of the total dataset
    • This distribution avoids large excursions.

Data Loss and Potential Adjustments¶

  • Data Loss: Approximately 40% of the initial dataset was lost.

  • Potential Correction: This data loss can potentially be mitigated by lowering the cosine similarity threshold between embeddings.

¶

In [272]:
df = pd.read_csv('cleaned_data_name_thread.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
df.shape[0]
Out[272]:
66735
In [ ]:
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
tc1 = ppt.TextClustering(df, 'name_thread')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
In [ ]:
len(tc1.corpus), len(tc1.corpus_embeddings)
In [ ]:
seed_topic_list = [[
    'tor site', 'drug', 'cocaine', 'ketamine', 'weed', 'trafficking', 'scammer', 'market', 'vendor', 'bitcoin',
    'mdma', 'coke', 'lsd', 'heroine', 'xanax', 'tor node', 'tor site', 'gun', 'weapon', 'hacking'
]]

zeroshot_topic_list = [pd.read_csv('../../../intent_crime.csv')['intent'].tolist()]

representation_model = MaximalMarginalRelevance(diversity=0.3)
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=1200, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    language='multilingual',
    top_n_words=10, 
    n_gram_range=(1, 2),
    umap_model=umap_model, 
    hdbscan_model=hdbscan_model, 
    seed_topic_list=seed_topic_list, 
    vectorizer_model=vectorizer_model, 
    ctfidf_model=ctfidf_model, 
    representation_model=representation_model,
    zeroshot_topic_list=zeroshot_topic_list, 
    zeroshot_min_similarity=.05, 
    verbose=True
)

topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
In [21]:
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
   Topic  Count                               Name  \
0     -1  34449            -1_new_free_ticket_help   
1      0   7495          0_weed_xanax_cocaine_coke   
2      1   6093    1_market_dream_empire_nightmare   
3      2   5034     2_vendor_scammer_scam_scamming   
4      3   4087      3_review_vendor_feedback_mdma   
5      4   4003              4_mdma_lsd_shit_whats   
6      5   2402  5_order_package_delivery_shipping   
7      6   1966       6_bitcoin_card_wallet_credit   

                                      Representation  \
0  [new, free, ticket, help, update, account, mdm...   
1  [weed, xanax, cocaine, coke, ketamine, mg, can...   
2  [market, dream, empire, nightmare, vendor, wal...   
3  [vendor, scammer, scam, scamming, exit, scamme...   
4  [review, vendor, feedback, mdma, mg, sample, r...   
5  [mdma, lsd, shit, whats, fuck, gone, got, guy,...   
6  [order, package, delivery, shipping, tracking,...   
7  [bitcoin, card, wallet, credit, coin, carding,...   

                                 Representative_Docs  
0  [canadianflavor weed shatter cbd edible hash c...  
1  [high quality weed thc product europe, new xan...  
2  [next market, dream market vendor rstclass nig...  
3  [looking good reliable vendor sell ounce, vend...  
4  [empire vendor cocaine review, first ever revi...  
5  [hey ro im gon na pull pk, life wonderful life...  
6  [order accepted day still hasnt marked shipped...  
7  [credit cards paypal prepaid card find, got cc...  
Topic 0:
[('weed', 0.5972313505812425), ('xanax', 0.5664832282989213), ('cocaine', 0.5350787342936356), ('coke', 0.4710111701375004), ('ketamine', 0.46985128023380035), ('mg', 0.46256209204548415), ('cannabis', 0.41853925594172725), ('drug', 0.4053330171594432), ('pill', 0.3907822559981816), ('quality', 0.38621568363790615)]
Topic 1:
[('market', 0.892430998800942), ('dream', 0.6865843677324943), ('empire', 0.6830028029033173), ('nightmare', 0.5681939396872522), ('vendor', 0.34305231363817884), ('wall', 0.3245499595042113), ('marketplace', 0.319921898437173), ('scam', 0.2961241301762431), ('exit', 0.2960733863924834), ('link', 0.2915460778160393)]
Topic 2:
[('vendor', 0.6950361459297074), ('scammer', 0.6725026815231682), ('scam', 0.4980623980369779), ('scamming', 0.46575246018365657), ('exit', 0.44160475610894967), ('scammed', 0.40051759892624533), ('looking', 0.37884048200047027), ('warning', 0.37715463753082534), ('reliable', 0.37144259341974245), ('buyer', 0.3708904841304073)]
Topic 3:
[('review', 1.002255217202406), ('vendor', 0.5076272530565451), ('feedback', 0.4049037794348937), ('mdma', 0.381329954044546), ('mg', 0.37619091451980585), ('sample', 0.3754397070467268), ('reviews', 0.3504300951320543), ('lsd', 0.3465899767001684), ('opinion', 0.3303160657068881), ('xanax', 0.33022254366369147)]
Topic 4:
[('mdma', 0.38275973612659386), ('lsd', 0.3779572278615291), ('shit', 0.35340590919386444), ('whats', 0.34834774258692336), ('fuck', 0.3264035078860319), ('gone', 0.31797094824590016), ('got', 0.3167851762249627), ('guy', 0.3153758862961693), ('dead', 0.31361936874635366), ('going', 0.3042237209259171)]
Topic 5:
[('order', 0.9350712100343167), ('package', 0.6655706541276237), ('delivery', 0.562721266995139), ('shipping', 0.527231820138037), ('tracking', 0.5122872117651205), ('shipped', 0.48839280205239965), ('ordering', 0.4784769909883374), ('cancelled', 0.47119974969542505), ('pack', 0.4566507281813944), ('delivered', 0.45351148583756845)]
Topic 6:
[('bitcoin', 0.8235475804294793), ('card', 0.7734286502423073), ('wallet', 0.6772588642347616), ('credit', 0.6731588060336892), ('coin', 0.5703668040987371), ('carding', 0.5529443276986676), ('btc', 0.5121844608207589), ('cash', 0.5037356917020909), ('debit', 0.500260454896595), ('coinbase', 0.49454000630077194)]
Topic -1:
[('new', 0.28398750337326484), ('free', 0.2771677713524054), ('ticket', 0.2699448449851029), ('help', 0.2697705189262906), ('update', 0.2675394807401724), ('account', 0.26547262677161937), ('mdma', 0.2638718211547908), ('vendor', 0.2588459510247759), ('dispute', 0.25440435619535773), ('need', 0.2488688355528112)]
In [22]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6388389468193054
Davies_bouldin_score: 0.5523262827209047
In [24]:
best_indices = np.argsort(silhouette_scores)[-10:]
best_umap_embeddings = umap_embeddings[best_indices]
plt.figure(figsize=(10, 5))
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar()
plt.title('UMAP projection of the topics with highest silhouette scores', fontsize=24)
plt.show()
No description has been provided for this image
In [25]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
In [26]:
topic_model.get_topic_freq()
Out[26]:
Topic Count
0 -1 34449
5 0 7495
2 1 6093
1 2 5034
7 3 4087
6 4 4003
3 5 2402
4 6 1966
In [27]:
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
   Topic  Count                                         Name  \
0     -1  34449                      -1_vendor_new_free_help   
1      0   7495                       0_weed_vendor_xanax_mg   
2      1   6093                 1_market_empire_dream_vendor   
3      2   5034                2_vendor_scammer_scam_looking   
4      3   4087  3_review_vendor review_vendor_review vendor   
5      4   4003                          4_mdma_lsd_good_got   
6      5   2402            5_order_package_shipping_delivery   
7      6   1966                 6_card_bitcoin_wallet_credit   

                                      Representation  \
0  [vendor, new, free, help, best, account, uk, u...   
1  [weed, vendor, xanax, mg, cocaine, uk, best, c...   
2  [market, empire, dream, vendor, nightmare, dre...   
3  [vendor, scammer, scam, looking, scamming, exi...   
4  [review, vendor review, vendor, review vendor,...   
5  [mdma, lsd, good, got, shit, whats, guy, fuck,...   
6  [order, package, shipping, delivery, vendor, p...   
7  [card, bitcoin, wallet, credit, btc, carding, ...   

                                 Representative_Docs  
0  [canadianflavor weed shatter cbd edible hash c...  
1  [high quality weed thc product europe, new xan...  
2  [next market, dream market vendor rstclass nig...  
3  [looking good reliable vendor sell ounce, vend...  
4  [empire vendor cocaine review, first ever revi...  
5  [hey ro im gon na pull pk, life wonderful life...  
6  [order accepted day still hasnt marked shipped...  
7  [credit cards paypal prepaid card find, got cc...  
Topic 0:
[('weed', 0.02425497350614531), ('vendor', 0.021978341010015688), ('xanax', 0.02077949072716719), ('mg', 0.01948517638840499), ('cocaine', 0.018417804414484252), ('uk', 0.015046793957699879), ('best', 0.013425752943917355), ('coke', 0.012717130457267087), ('ketamine', 0.01175969464362258), ('cannabis', 0.010948216683877144)]
Topic 1:
[('market', 0.09008978566905657), ('empire', 0.055274112551010335), ('dream', 0.04917325935832957), ('vendor', 0.024276714575283735), ('nightmare', 0.023605168431774765), ('dream market', 0.016025449931173885), ('empire market', 0.014646720705699409), ('new', 0.009033909010090109), ('nightmare market', 0.008867402221856543), ('scam', 0.006303868464254871)]
Topic 2:
[('vendor', 0.09965429794348642), ('scammer', 0.025788920958809015), ('scam', 0.017833603310448354), ('looking', 0.01337570071081538), ('scamming', 0.012208815488636926), ('exit', 0.011806364340026236), ('scammed', 0.008689720115543394), ('uk', 0.008678133768927804), ('good', 0.008493482524539575), ('warning', 0.008418582129949287)]
Topic 3:
[('review', 0.1428141634073404), ('vendor review', 0.058876246025626515), ('vendor', 0.05315846344525214), ('review vendor', 0.021049951157661017), ('review vendor review', 0.017406474951027713), ('review review', 0.015138695407876355), ('mg', 0.012888546716744416), ('mdma', 0.011146461993445255), ('sample', 0.010133356066428198), ('dream', 0.009783289767907996)]
Topic 4:
[('mdma', 0.011231558108969678), ('lsd', 0.009238251834183116), ('good', 0.007359917621616781), ('got', 0.006638868206622288), ('shit', 0.0065802885463340675), ('whats', 0.006051630264178851), ('guy', 0.005697866126116449), ('fuck', 0.005394916465354471), ('going', 0.005375411718474036), ('wsm', 0.0052967375805114646)]
Topic 5:
[('order', 0.09533424569336707), ('package', 0.025076372096897597), ('shipping', 0.02284913659637588), ('delivery', 0.018139605364174704), ('vendor', 0.014195026757439324), ('pack', 0.014024930561711633), ('tracking', 0.012976075064416448), ('shipped', 0.012741042718045418), ('ordering', 0.01153929794529684), ('time', 0.01087192180365464)]
Topic 6:
[('card', 0.04045581193563761), ('bitcoin', 0.03526436871145481), ('wallet', 0.02671909128748556), ('credit', 0.02286661027552805), ('btc', 0.0196385675748142), ('carding', 0.018970779081355412), ('coin', 0.016677548495845462), ('credit card', 0.014601870612078016), ('cash', 0.012420616388040553), ('bank', 0.010979756425111214)]
Topic -1:
[('vendor', 0.013820616851140987), ('new', 0.009152016420677532), ('free', 0.006913858221511509), ('help', 0.006453408973195096), ('best', 0.0060032500179123234), ('account', 0.005801364375676093), ('uk', 0.005664162822486113), ('update', 0.005547486073465391), ('crosspost', 0.005503646525948444), ('need', 0.00541678801673178)]
In [28]:
topic_model.visualize_topics()

7DistanceTimeSeries_0.641200-2.png

In [29]:
topic_model.visualize_heatmap()

7MatrixDistribution_0.64sil1200-2.png

In [30]:
topic_model.visualize_hierarchy()

image.png

In [31]:
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, 
                          min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

7ClusterDistribution_0.64sil1200.png

In [32]:
topic_model.visualize_barchart()

image.png

In [ ]:
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.53)
topic_model.update_topics(tc1.corpus, topics=new_topics)
In [75]:
topic_model.get_topic_info()
Out[75]:
Topic Count Name Representation Representative_Docs
0 -1 27613 -1_anyone_new_help_free [anyone, new, help, free, please, update, tick... [canadianflavor weed shatter cbd edible hash c...
1 0 8645 0_weed_xanax_vendor_cocaine [weed, xanax, vendor, cocaine, mg, uk, coke, b... [high quality weed thc product europe, new xan...
2 1 6236 1_market_empire_dream_nightmare [market, empire, dream, nightmare, vendor, dre... [next market, dream market vendor rstclass nig...
3 2 6907 2_vendor_scammer_scam_looking [vendor, scammer, scam, looking, scamming, sal... [looking good reliable vendor sell ounce, vend...
4 3 4230 3_review_vendor review_vendor_review vendor [review, vendor review, vendor, review vendor,... [empire vendor cocaine review, first ever revi...
5 4 6299 4_mdma_lsd_get_looking [mdma, lsd, get, looking, wsm, good, btc, ques... [hey ro im gon na pull pk, life wonderful life...
6 5 2776 5_order_package_shipping_delivery [order, package, shipping, delivery, pack, shi... [order accepted day still hasnt marked shipped...
7 6 2823 6_bitcoin_card_wallet_btc [bitcoin, card, wallet, btc, bank, credit, car... [credit cards paypal prepaid card find, got cc...
In [76]:
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

image-2.png

In [77]:
topic_model.visualize_hierarchy()

image-2.png

In [78]:
topic_model.visualize_topics()

image-2.png

In [88]:
topic_model.visualize_barchart()

7BarChartDistribution_0.64sil1200_after.png

In [79]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5083789229393005
Davies_bouldin_score: 0.7570962651091117
In [30]:
topic_words = topic_model.get_topics()
topics = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]

topn = 10
topic_list = []
for topic in topics:
    topic_list.append([word for word, _ in topic[:topn]])

coherence_model = CoherenceModel(
    topics=topic_list,
    texts=[doc.split() for doc in tc1.corpus],
    dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
    coherence='c_v'
)

print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.40058884901572617
In [ ]:
df['name_thread'] = df['name_thread'].str.lower().dropna()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()
len(created_on)
In [ ]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=100)
In [82]:
topic_model.visualize_topics_over_time(topics_over_time, width=1250, height=700)

7ClusterTimeSeries_0.641200-3.png

In [83]:
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')

results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
(37916, 10)
Out[83]:
Document Embedding Topic Probability Created_on Count Name Representation Representative_Docs UMAP_embedding
0 checks [0.052164897, 0.029597273, -0.03666609, 0.0051... 4 0.000000 2020-01-09 6299 4_mdma_lsd_get_looking [mdma, lsd, get, looking, wsm, good, btc, ques... [hey ro im gon na pull pk, life wonderful life... [1.6488198, 9.914265, 1.442794, 2.8094368, -0....
1 trusted vendor status [0.02445144, -0.008732641, -0.0050215074, 0.01... 2 0.944247 2020-01-09 6907 2_vendor_scammer_scam_looking [vendor, scammer, scam, looking, scamming, sal... [looking good reliable vendor sell ounce, vend... [2.910516, 10.281041, 1.650234, 3.0320778, -0....
2 empire exit scam iiflux user incomming [0.02890829, 0.036081452, -0.027694924, -0.007... 1 1.000000 2019-11-06 6236 1_market_empire_dream_nightmare [market, empire, dream, nightmare, vendor, dre... [next market, dream market vendor rstclass nig... [1.5884036, 9.8587885, 3.3090453, 2.652358, 2....
3 ecstasy vendor packs [-0.022524439, 0.03949761, -0.023750877, 0.033... 5 0.797741 2020-01-09 2776 5_order_package_shipping_delivery [order, package, shipping, delivery, pack, shi... [order accepted day still hasnt marked shipped... [2.0245404, 10.517631, 2.3443217, 3.7595236, -...
4 opening bank account person fake id [-0.029834118, 0.03354508, -0.012210185, -0.02... 6 1.000000 2019-11-06 2823 6_bitcoin_card_wallet_btc [bitcoin, card, wallet, btc, bank, credit, car... [credit cards paypal prepaid card find, got cc... [0.7278271, 9.884823, 1.8116106, 2.9336705, -0...
In [ ]:
topic_model.save("Models/topic_model_0.64SilNew", serialization='pickle')
In [85]:
results_final.to_parquet('ResultsBERTopic/BERTopic_nodefinedcluster_topics_15n_10com_1200cluster_0.64sil_renewout.parquet')
In [86]:
sns.histplot(results_final, x='Topic', discrete=True);
No description has been provided for this image
In [87]:
plt.pie(results_final.value_counts('Topic'), labels=results_final.value_counts('Topic').index, autopct='%1.1f%%');
No description has been provided for this image

500 min cluster size¶

In [ ]:
seed_topic_list = [[
    'tor site', 'drug', 'cocaine', 'ketamine', 'weed', 'trafficking', 'scammer', 'market', 'vendor', 'bitcoin',
    'mdma', 'coke', 'lsd', 'heroine', 'xanax', 'tor node', 'tor site', 'gun', 'weapon', 'hacking'
]]

zeroshot_topic_list = [pd.read_csv('../../../intent_crime.csv')['intent'].tolist()]

representation_model = MaximalMarginalRelevance(diversity=0.3)
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=500, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    language='multilingual',
    top_n_words=10, 
    n_gram_range=(1, 2),
    umap_model=umap_model, 
    hdbscan_model=hdbscan_model, 
    seed_topic_list=seed_topic_list, 
    vectorizer_model=vectorizer_model, 
    ctfidf_model=ctfidf_model, 
    representation_model=representation_model,
    zeroshot_topic_list=zeroshot_topic_list, 
    zeroshot_min_similarity=.05, 
    verbose=True
)

topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
In [59]:
topic_model.get_topic_info()
Out[59]:
Topic Count Name Representation Representative_Docs
0 -1 28000 -1_mdma_new_link_lsd [mdma, new, link, lsd, help, free, vendor, nee... [need high quality fake id check , big thanks ...
1 0 4930 0_xanax_coke_cocaine_ketamine [xanax, coke, cocaine, ketamine, mg, drug, pil... [promo sale mg adderall ad xanax mg lsd mdma u...
2 1 4469 1_bitcoin_card_bank_carding [bitcoin, card, bank, carding, monero, wallet,... [way cash bank log using btc, send bitcoin get...
3 2 4227 2_dread_sub_lsd_shit [dread, sub, lsd, shit, mdma, whats, guy, fuck... [hey guy xangod man, let guy know dread host w...
4 3 3702 3_market_dream_nightmare_dreammarket [market, dream, nightmare, dreammarket, market... [not order nightmare market, nightmare market ...
5 4 3469 4_review_vendor_reviews_mg [review, vendor, reviews, mg, vendymcvendface,... [thclear ml purple kush vape cart review, vend...
6 5 3410 5_order_package_pack_dispute [order, package, pack, dispute, delivery, ship... [package custom month love letter nothing, pac...
7 6 2700 6_vendor_looking_seller_vendors [vendor, looking, seller, vendors, buyer, lsd,... [best vendor uk lsd, looking good vendor cc fu...
8 7 1694 7_weed_cannabis_marijuana_hash [weed, cannabis, marijuana, hash, quality, str... [hash weed ship eu good vendor also usa, new i...
9 8 1540 8_darknet_dark_web_sentenced [darknet, dark, web, sentenced, drug, darkweb,... [tacoma man sentenced four year dealing drugs ...
10 9 1502 9_empire_dispute_deposit_empiremarket [empire, dispute, deposit, empiremarket, scamm... [empire next, give me empire, empire anyone else]
11 10 1475 10_account_password_pgp_hacking [account, password, pgp, hacking, hacked, secu... [vendor enerygcontrolled hacked ca nt log pass...
12 11 1314 11_tried_anybody_heard_ordered [tried, anybody, heard, ordered, used, recentl... [anybody heard pasitheas, anyone order recentl...
13 12 1031 12_scammer_scam_exit_scamming [scammer, scam, exit, scamming, warning, scamm... [xangod scammer going exit scam proof, cottage...
14 13 777 13_update_maintenance_updated_upgrade [update, maintenance, updated, upgrade, vender... [shipping update, update order, vendor update]
15 14 681 14_ticket_support_deposit_month [ticket, support, deposit, month, response, an... [support ticket ticket, please help support ti...
16 15 608 15_sample_samples_free_test [sample, samples, free, test, testing, lab, te... [xanax mg shipping free samples, new vendor fr...
In [60]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5718363523483276
Davies_bouldin_score: 0.6211900149809264
In [61]:
best_indices = np.argsort(silhouette_scores)[-10:]
best_umap_embeddings = umap_embeddings[best_indices]
unique_labels = np.unique(labels)
cmap = plt.cm.magma
plt.figure(figsize=(10, 5))
scatter = plt.scatter(X[:, 1], X[:, 2], c=labels, cmap=cmap, s=5)
plt.gca().set_aspect('equal', 'datalim')
norm = plt.Normalize(vmin=min(labels), vmax=max(labels))
handles = [plt.Line2D([0], [0], marker='o', color=cmap(norm(label)), linestyle='', markersize=10) for label in unique_labels]
legend_labels = [f'Class {label}' for label in unique_labels]
plt.legend(handles, legend_labels, title="Classes")
plt.colorbar(scatter, ticks=range(len(unique_labels)))
plt.title('UMAP projection of the topics with highest silhouette scores', fontsize=24)
plt.show()
No description has been provided for this image
In [62]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
In [63]:
topic_model.visualize_topics()

16DistanceDistribution_0.5sil500.png

In [64]:
topic_model.visualize_heatmap()

16MatrixDistribution_0.5sil500.png

In [65]:
topic_model.visualize_hierarchy()

16HirachicalDistribution_0.5sil500.png

In [66]:
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, 
                          min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

16ClusterDistribution_0.5sil500.png

In [ ]:
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.6)
topic_model.update_topics(tc1.corpus, topics=new_topics)
In [96]:
topic_model.get_topic_info()
Out[96]:
Topic Count Name Representation Representative_Docs
0 -1 23928 -1_new_vendor_help_uk [new, vendor, help, uk, need, mdma, best, free... [need high quality fake id check , big thanks ...
1 0 5207 0_xanax_cocaine_mg_coke [xanax, cocaine, mg, coke, ketamine, vendor, p... [promo sale mg adderall ad xanax mg lsd mdma u...
2 1 4512 1_bitcoin_card_bank_carding [bitcoin, card, bank, carding, monero, wallet,... [way cash bank log using btc, send bitcoin get...
3 2 4944 2_dread_mdma_lsd_get [dread, mdma, lsd, get, sub, shit, guy, lookin... [hey guy xangod man, let guy know dread host w...
4 3 3801 3_market_dream_nightmare_dream market [market, dream, nightmare, dream market, vendo... [not order nightmare market, nightmare market ...
5 4 3706 4_review_vendor review_vendor_review vendor [review, vendor review, vendor, review vendor,... [thclear ml purple kush vape cart review, vend...
6 5 3434 5_order_dispute_pack_package [order, dispute, pack, package, shipping, deli... [package custom month love letter nothing, pac...
7 6 4123 6_vendor_vendor vendor_looking_best [vendor, vendor vendor, looking, best, inquiry... [best vendor uk lsd, looking good vendor cc fu...
8 7 1848 7_weed_cannabis_uk_weed vendor [weed, cannabis, uk, weed vendor, vendor, qual... [hash weed ship eu good vendor also usa, new i...
9 8 1557 8_darknet_dark_dark web_web [darknet, dark, dark web, web, drug, sentenced... [tacoma man sentenced four year dealing drugs ...
10 9 1835 9_empire_empire market_market_empire empire [empire, empire market, market, empire empire,... [empire next, give me empire, empire anyone else]
11 10 1542 10_account_pgp_password_vendor account [account, pgp, password, vendor account, crypt... [vendor enerygcontrolled hacked ca nt log pass...
12 11 1394 11_anyone_has_has anyone_anybody [anyone, has, has anyone, anybody, tried, anyo... [anybody heard pasitheas, anyone order recentl...
13 12 1398 12_scammer_scam_exit_scamming [scammer, scam, exit, scamming, scammed, warni... [xangod scammer going exit scam proof, cottage...
14 13 826 13_update_maintenance_updated_update update [update, maintenance, updated, update update, ... [shipping update, update order, vendor update]
15 14 682 14_ticket_support ticket_support_please [ticket, support ticket, support, please, depo... [support ticket ticket, please help support ti...
16 15 792 15_sample_free_free sample_samples [sample, free, free sample, samples, free samp... [xanax mg shipping free samples, new vendor fr...
In [97]:
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

16ClusterDistribution_0.5sil500_after.png

In [98]:
topic_model.visualize_topics()

16DistanceDistribution_0.5sil500_after.png

In [106]:
topic_model.visualize_hierarchy()

16HirachicalDistribution_0.5sil500_after.png

In [15]:
topic_model.visualize_barchart(top_n_topics=16)

16BarChartDistribution_0.5sil500_after-2.png

In [99]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.49986162781715393
Davies_bouldin_score: 0.7193546666619981
In [28]:
topic_words = topic_model.get_topics()
topics = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]

topn = 10
topic_list = []
for topic in topics:
    topic_list.append([word for word, _ in topic[:topn]])

coherence_model = CoherenceModel(
    topics=topic_list,
    texts=[doc.split() for doc in tc1.corpus],
    dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
    coherence='c_v'
)

print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.4902822303421074
In [100]:
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')

results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
results_final.to_parquet('ResultsBERTopic/BERTopic_nodefinedcluster_topics_15n_10com_500cluster_0.54sil_renewout.parquet')
(41601, 10)
In [ ]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, width=1250, height=700)

16ClusterTimeSeries500.png

In [107]:
plt.pie(results_final.value_counts('Topic'), labels=results_final.value_counts('Topic').index, autopct='%1.1f%%');
No description has been provided for this image
In [108]:
sns.histplot(results_final, x='Topic', discrete=True);
No description has been provided for this image
In [ ]:
topic_model.save("Models/topic_model_0.50Sil300", serialization='pickle')

400 all-MiniLM-L6-v2¶

In [53]:
df = pd.read_csv('cleaned_data_name_thread.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
df.shape[0]
Out[53]:
66735
In [ ]:
model = SentenceTransformer('all-MiniLM-L6-v2')
tc1 = ppt.TextClustering(df, 'name_thread')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
In [ ]:
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=400, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    top_n_words=10, 
    n_gram_range=(1, 2),
    umap_model=umap_model, 
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model, 
    ctfidf_model=ctfidf_model, 
    representation_model=[mmr, kw],
    embedding_model=model,
    verbose=True
)

topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
In [23]:
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
    Topic  Count                                               Name  \
0      -1  30941                         -1_customer_buy_sale_buyer   
1       0   5117                     0_vape_shatter_carts_cartridge   
2       1   2643                      1_login_password_logged_error   
3       2   2579                2_coca_opium_cocain_cocacolacompany   
4       3   2124               3_xanaxlabs_xanaxlife_xanax_xanaxusa   
5       4   1938                     4_postal_usps_delivery_postage   
6       5   1842     5_darkweb_darknetlive_darknetmarkets_sentenced   
7       6   1721      6_empire_empiremarket_empireteam_empiredealer   
8       7   1631                 7_mdma_mdmamaster_pill_ecstasydata   
9       8   1601               8_giftcard_card_giftcards_mastercard   
10      9   1502              9_vendor_vendorpro_vendors_vendorbbmc   
11     10   1417                  10_scamming_scammer_scam_scammers   
12     11   1126      11_counterfeiting_passport_counterfeit_fakeid   
13     12   1072       12_dreammarket_nightmaremarket_market_dreams   
14     13    979                            13_lsd_tab_tabs_shrooms   
15     14    739                      14_monero_coinbase_coin_coins   
16     15    676               15_review_reviewing_reviews_reviewer   
17     16    674           16_pickledrick_heard_theoutfit_muttznutz   
18     17    669            17_market_markets_marketplace_marketing   
19     18    626         18_crosspost_deposting_goingpostal_vendors   
20     19    603              19_deposit_depositing_deposits_ticket   
21     20    573                              20_pgpkey_pgp_pgps_pg   
22     21    535                  21_mod_moderator_dispute_disputes   
23     22    450  22_cryptonia_cryptoniausers_cryptonians_cryptn...   
24     23    445                 23_wsm_wsms_vendorcp_machinerymint   
25     24    443     24_ketamine_ketamin_ketamineking_ketaminekings   
26     25    434             25_ticket_ticketmaster_ticketw_tickets   
27     26    429        26_meth_methbusters_methamphetamine_crystal   

                                       Representation  \
0   [customer, buy, sale, buyer, service, message,...   
1   [vape, shatter, carts, cartridge, ounce, marij...   
2   [login, password, logged, error, problem, log,...   
3   [coca, opium, cocain, cocacolacompany, coke, c...   
4   [xanaxlabs, xanaxlife, xanax, xanaxusa, xanaxr...   
5   [postal, usps, delivery, postage, mail, delive...   
6   [darkweb, darknetlive, darknetmarkets, sentenc...   
7   [empire, empiremarket, empireteam, empiredeale...   
8   [mdma, mdmamaster, pill, ecstasydata, mdmaus, ...   
9   [giftcard, card, giftcards, mastercard, cards,...   
10  [vendor, vendorpro, vendors, vendorbbmc, vendo...   
11  [scamming, scammer, scam, scammers, scammed, s...   
12  [counterfeiting, passport, counterfeit, fakeid...   
13  [dreammarket, nightmaremarket, market, dreams,...   
14  [lsd, tab, tabs, shrooms, acid, blotter, blott...   
15  [monero, coinbase, coin, coins, cryptocurrency...   
16  [review, reviewing, reviews, reviewer, reviewe...   
17  [pickledrick, heard, theoutfit, muttznutz, hou...   
18  [market, markets, marketplace, marketing, nonm...   
19  [crosspost, deposting, goingpostal, vendors, c...   
20  [deposit, depositing, deposits, ticket, deposi...   
21  [pgpkey, pgp, pgps, pg, pgc, gnupg, key, gpg, ...   
22  [mod, moderator, dispute, disputes, disputers,...   
23  [cryptonia, cryptoniausers, cryptonians, crypt...   
24  [wsm, wsms, vendorcp, machinerymint, wowza, pa...   
25  [ketamine, ketamin, ketamineking, ketamineking...   
26  [ticket, ticketmaster, ticketw, tickets, suppo...   
27  [meth, methbusters, methamphetamine, crystal, ...   

                                  Representative_Docs  
0   [dutchdrugz updates promo active till market p...  
1   [sale girl scout cookie carts strains oz lb us...  
2   [hey really could use help advice thanks, erro...  
3   [colombian coke brazil ship world wide promoti...  
4   [adderall mg ir adderall mg xanax super sale, ...  
5   [informed delivery showing package, usa canada...  
6   [three student arrested dark web drug traffick...  
7   [empire anyone else, empire market back, empir...  
8   [sale xtc pill mg mda us ca, uk mdma pill vend...  
9   [carding amazon gift card, gift card prepaid d...  
10  [nmm giving vendor runaround lying acting shad...  
11  [market exit scam next, scam alert ukdrugdeale...  
12  [buy counterfeit money real fake document, buy...  
13  [dream market still, dream market, eleven drea...  
14  [lsd blotter tab ug top quality, point one fre...  
15  [looking best safe way buy large amount bitcoi...  
16  [needing send sample bar trusted reviewer woul...  
17  [anybody heard theoutfit, anybody heard pickle...  
18  [market anyone else, market, currently working...  
19  [envoy want crosspost, could vendor crosspost,...  
20  [missing deposit double deposit please help, a...  
21     [pgp public key, market pgp key, find pgp key]  
22  [moderator dispute day, moderator please help ...  
23  [cryptonia market, market king samsara crypton...  
24                   [wsm vendor, wsm back, wsm down]  
25       [ketamine us, get ketamine, ketamine anyone]  
26  [help support ticket please, help support tick...  
27  [crystal meth uk, crystal meth, crystal meth v...  
Topic 0:
[('vape', 0.4513024), ('shatter', 0.4508166), ('carts', 0.42475972), ('cartridge', 0.4150574), ('ounce', 0.38511506), ('marijuana', 0.3761327), ('cannabis', 0.37473193), ('edibles', 0.36946523), ('weed', 0.35874215), ('cart', 0.3494926)]
Topic 1:
[('login', 0.6874596), ('password', 0.58739483), ('logged', 0.44535103), ('error', 0.39473626), ('problem', 0.38404456), ('log', 0.3703017), ('account', 0.36962464), ('help', 0.36578366), ('trouble', 0.3579351), ('session', 0.34920555)]
Topic 2:
[('coca', 0.5442445), ('opium', 0.5241908), ('cocain', 0.48566723), ('cocacolacompany', 0.47682497), ('coke', 0.4701375), ('cocainehcl', 0.4403491), ('cocaine', 0.43470532), ('heroinfactory', 0.43406424), ('colombian', 0.40406665), ('cokemaster', 0.39702898)]
Topic 3:
[('xanaxlabs', 0.68098766), ('xanaxlife', 0.6694618), ('xanax', 0.64481914), ('xanaxusa', 0.5943617), ('xanaxring', 0.5927005), ('xanaxdepot', 0.5860753), ('xanaxdaddy', 0.57530177), ('xanaxblotters', 0.5676911), ('alprazolam', 0.5388765), ('xanaxinc', 0.5038374)]
Topic 4:
[('postal', 0.5783647), ('usps', 0.5671008), ('delivery', 0.552514), ('postage', 0.5435632), ('mail', 0.4794371), ('deliver', 0.46840727), ('package', 0.4595977), ('shipment', 0.4503156), ('shipping', 0.44325382), ('fedex', 0.44258836)]
Topic 5:
[('darkweb', 0.5460649), ('darknetlive', 0.47999817), ('darknetmarkets', 0.46108282), ('sentenced', 0.4581046), ('darknetmarketsnoobs', 0.4534067), ('darknet', 0.45285586), ('darkbay', 0.45059866), ('darkfail', 0.44140962), ('darkdotfail', 0.42702472), ('darknetaustralia', 0.42165762)]
Topic 6:
[('empire', 0.8657665), ('empiremarket', 0.8325376), ('empireteam', 0.7658358), ('empiredealer', 0.73584473), ('empires', 0.7089321), ('imperial', 0.59743464), ('imperialroyalty', 0.533589), ('market', 0.39446667), ('scammer', 0.3011508), ('nightmare', 0.29797795)]
Topic 7:
[('mdma', 0.57491755), ('mdmamaster', 0.55362886), ('pill', 0.54554516), ('ecstasydata', 0.54158187), ('mdmaus', 0.536477), ('mdacanada', 0.49906433), ('mda', 0.47733676), ('md', 0.47456974), ('ecstasy', 0.46981525), ('mg', 0.45221412)]
Topic 8:
[('giftcard', 0.68464833), ('card', 0.6067195), ('giftcards', 0.60337466), ('mastercard', 0.5686253), ('cards', 0.5325688), ('carding', 0.5214343), ('debit', 0.500812), ('carded', 0.49536285), ('carder', 0.48081687), ('cardable', 0.45047107)]
Topic 9:
[('vendor', 0.6717965), ('vendorpro', 0.64170885), ('vendors', 0.63945156), ('vendorbbmc', 0.6131782), ('vendorshop', 0.5619679), ('supplier', 0.4961744), ('shop', 0.43687624), ('inventory', 0.38063982), ('dealer', 0.37658587), ('trusted', 0.35675985)]
Topic 10:
[('scamming', 0.67339057), ('scammer', 0.64245546), ('scam', 0.6315777), ('scammers', 0.60618246), ('scammed', 0.5859374), ('scams', 0.5844768), ('exit', 0.38286078), ('ukdrugdealer', 0.37872887), ('warning', 0.35860184), ('confirmed', 0.3483911)]
Topic 11:
[('counterfeiting', 0.5351553), ('passport', 0.49532643), ('counterfeit', 0.48550797), ('fakeid', 0.46835682), ('forgery', 0.46821818), ('passports', 0.46553856), ('certificate', 0.46403533), ('fakeids', 0.36332572), ('licenses', 0.3491515), ('citizenship', 0.33687454)]
Topic 12:
[('dreammarket', 0.840524), ('nightmaremarket', 0.7301478), ('market', 0.679103), ('dreams', 0.5537206), ('nightmare', 0.54951864), ('dream', 0.52395815), ('dreaming', 0.51259714), ('nightmares', 0.5112673), ('dreamweaver', 0.4622426), ('deals', 0.4392535)]
Topic 13:
[('lsd', 0.6597349), ('tab', 0.4486916), ('tabs', 0.42244914), ('shrooms', 0.40983063), ('acid', 0.37709463), ('blotter', 0.3619333), ('blotters', 0.34030285), ('microdose', 0.31792137), ('dmt', 0.30784056), ('samspade', 0.306018)]
Topic 14:
[('monero', 0.66440576), ('coinbase', 0.6017641), ('coin', 0.58206344), ('coins', 0.55229485), ('cryptocurrency', 0.54781383), ('crypto', 0.5190888), ('bitcoin', 0.49815544), ('btc', 0.4951193), ('cryptocurrencies', 0.49073264), ('bitcoins', 0.48276216)]
Topic 15:
[('review', 0.7554549), ('reviewing', 0.70764035), ('reviews', 0.67082256), ('reviewer', 0.6707778), ('reviewed', 0.66799235), ('vendor', 0.3507808), ('post', 0.3232708), ('sample', 0.3039448), ('journal', 0.28708428), ('dankservices', 0.2783244)]
Topic 16:
[('pickledrick', 0.49188858), ('heard', 0.45528996), ('theoutfit', 0.4499943), ('muttznutz', 0.40856874), ('houseofdank', 0.38270152), ('purepharm', 0.3821613), ('thecandymanuk', 0.38004813), ('ndduk', 0.3797817), ('uzak', 0.37892848), ('turk', 0.37287065)]
Topic 17:
[('market', 0.9246511), ('markets', 0.82856095), ('marketplace', 0.66924006), ('marketing', 0.64059925), ('nonmarket', 0.63226146), ('undermarket', 0.5758176), ('traderoute', 0.5252505), ('farmersmarket', 0.51230544), ('demand', 0.48939776), ('trade', 0.4373095)]
Topic 18:
[('crosspost', 0.8023433), ('deposting', 0.54462177), ('goingpostal', 0.4369921), ('vendors', 0.3397432), ('courier', 0.31433263), ('tarred', 0.30136013), ('expose', 0.28236645), ('shop', 0.26232204), ('buyers', 0.25981808), ('weareamsterdam', 0.25617945)]
Topic 19:
[('deposit', 0.5940467), ('depositing', 0.54835135), ('deposits', 0.4703769), ('ticket', 0.4124618), ('deposited', 0.37039375), ('transaction', 0.32960162), ('btc', 0.29055083), ('fund', 0.28815228), ('unconfirmed', 0.28022093), ('twice', 0.27061075)]
Topic 20:
[('pgpkey', 0.78953433), ('pgp', 0.64266664), ('pgps', 0.60433674), ('pg', 0.57204497), ('pgc', 0.5202303), ('gnupg', 0.49523085), ('key', 0.4912796), ('gpg', 0.45877883), ('keys', 0.42667422), ('pgplogin', 0.40541986)]
Topic 21:
[('mod', 0.6461178), ('moderator', 0.6455801), ('dispute', 0.63188905), ('disputes', 0.53940743), ('disputers', 0.5393207), ('mods', 0.5271941), ('complaint', 0.47743487), ('modderator', 0.43813834), ('consensus', 0.3737623), ('handled', 0.37211758)]
Topic 22:
[('cryptonia', 0.82683897), ('cryptoniausers', 0.7519192), ('cryptonians', 0.7422215), ('cryptnonia', 0.6530852), ('cryptoni', 0.6209998), ('cryptoice', 0.5572725), ('market', 0.5073216), ('samasara', 0.42220467), ('samsera', 0.42188087), ('samsara', 0.3912958)]
Topic 23:
[('wsm', 0.8689953), ('wsms', 0.6338644), ('vendorcp', 0.41763154), ('machinerymint', 0.36969972), ('wowza', 0.36484522), ('paymwn', 0.32914096), ('maintenance', 0.31149185), ('greennz', 0.3085622), ('bionik', 0.30364022), ('bioniks', 0.30257553)]
Topic 24:
[('ketamine', 0.9532861), ('ketamin', 0.86957943), ('ketamineking', 0.8578399), ('ketaminekings', 0.8378519), ('ketaminehouse', 0.8028732), ('ketamax', 0.69982356), ('ketaconnect', 0.527894), ('tiletamine', 0.5001087), ('pyrimethamine', 0.48265585), ('pharmaceutical', 0.43739906)]
Topic 25:
[('ticket', 0.7282917), ('ticketmaster', 0.6860643), ('ticketw', 0.65911514), ('tickets', 0.62922376), ('support', 0.51385075), ('concert', 0.37351736), ('help', 0.29014573), ('assist', 0.28098187), ('fix', 0.27553594), ('outstanding', 0.27276954)]
Topic 26:
[('meth', 0.7546984), ('methbusters', 0.71206135), ('methamphetamine', 0.6617794), ('crystal', 0.6237694), ('methamph', 0.6163767), ('methoxetamine', 0.6146395), ('methadone', 0.58694017), ('dmethamphetamine', 0.5264992), ('methaqualone', 0.49982086), ('amphetamine', 0.49571955)]
Topic -1:
[('customer', 0.44219303), ('buy', 0.42263174), ('sale', 0.38992852), ('buyer', 0.38299185), ('service', 0.38183293), ('message', 0.37282392), ('update', 0.37055105), ('price', 0.37036857), ('paypal', 0.35097662), ('legit', 0.34381357)]
In [24]:
topic_model.get_topic_info()
Out[24]:
Topic Count Name Representation Representative_Docs
0 -1 30941 -1_customer_buy_sale_buyer [customer, buy, sale, buyer, service, message,... [dutchdrugz updates promo active till market p...
1 0 5117 0_vape_shatter_carts_cartridge [vape, shatter, carts, cartridge, ounce, marij... [sale girl scout cookie carts strains oz lb us...
2 1 2643 1_login_password_logged_error [login, password, logged, error, problem, log,... [hey really could use help advice thanks, erro...
3 2 2579 2_coca_opium_cocain_cocacolacompany [coca, opium, cocain, cocacolacompany, coke, c... [colombian coke brazil ship world wide promoti...
4 3 2124 3_xanaxlabs_xanaxlife_xanax_xanaxusa [xanaxlabs, xanaxlife, xanax, xanaxusa, xanaxr... [adderall mg ir adderall mg xanax super sale, ...
5 4 1938 4_postal_usps_delivery_postage [postal, usps, delivery, postage, mail, delive... [informed delivery showing package, usa canada...
6 5 1842 5_darkweb_darknetlive_darknetmarkets_sentenced [darkweb, darknetlive, darknetmarkets, sentenc... [three student arrested dark web drug traffick...
7 6 1721 6_empire_empiremarket_empireteam_empiredealer [empire, empiremarket, empireteam, empiredeale... [empire anyone else, empire market back, empir...
8 7 1631 7_mdma_mdmamaster_pill_ecstasydata [mdma, mdmamaster, pill, ecstasydata, mdmaus, ... [sale xtc pill mg mda us ca, uk mdma pill vend...
9 8 1601 8_giftcard_card_giftcards_mastercard [giftcard, card, giftcards, mastercard, cards,... [carding amazon gift card, gift card prepaid d...
10 9 1502 9_vendor_vendorpro_vendors_vendorbbmc [vendor, vendorpro, vendors, vendorbbmc, vendo... [nmm giving vendor runaround lying acting shad...
11 10 1417 10_scamming_scammer_scam_scammers [scamming, scammer, scam, scammers, scammed, s... [market exit scam next, scam alert ukdrugdeale...
12 11 1126 11_counterfeiting_passport_counterfeit_fakeid [counterfeiting, passport, counterfeit, fakeid... [buy counterfeit money real fake document, buy...
13 12 1072 12_dreammarket_nightmaremarket_market_dreams [dreammarket, nightmaremarket, market, dreams,... [dream market still, dream market, eleven drea...
14 13 979 13_lsd_tab_tabs_shrooms [lsd, tab, tabs, shrooms, acid, blotter, blott... [lsd blotter tab ug top quality, point one fre...
15 14 739 14_monero_coinbase_coin_coins [monero, coinbase, coin, coins, cryptocurrency... [looking best safe way buy large amount bitcoi...
16 15 676 15_review_reviewing_reviews_reviewer [review, reviewing, reviews, reviewer, reviewe... [needing send sample bar trusted reviewer woul...
17 16 674 16_pickledrick_heard_theoutfit_muttznutz [pickledrick, heard, theoutfit, muttznutz, hou... [anybody heard theoutfit, anybody heard pickle...
18 17 669 17_market_markets_marketplace_marketing [market, markets, marketplace, marketing, nonm... [market anyone else, market, currently working...
19 18 626 18_crosspost_deposting_goingpostal_vendors [crosspost, deposting, goingpostal, vendors, c... [envoy want crosspost, could vendor crosspost,...
20 19 603 19_deposit_depositing_deposits_ticket [deposit, depositing, deposits, ticket, deposi... [missing deposit double deposit please help, a...
21 20 573 20_pgpkey_pgp_pgps_pg [pgpkey, pgp, pgps, pg, pgc, gnupg, key, gpg, ... [pgp public key, market pgp key, find pgp key]
22 21 535 21_mod_moderator_dispute_disputes [mod, moderator, dispute, disputes, disputers,... [moderator dispute day, moderator please help ...
23 22 450 22_cryptonia_cryptoniausers_cryptonians_cryptn... [cryptonia, cryptoniausers, cryptonians, crypt... [cryptonia market, market king samsara crypton...
24 23 445 23_wsm_wsms_vendorcp_machinerymint [wsm, wsms, vendorcp, machinerymint, wowza, pa... [wsm vendor, wsm back, wsm down]
25 24 443 24_ketamine_ketamin_ketamineking_ketaminekings [ketamine, ketamin, ketamineking, ketamineking... [ketamine us, get ketamine, ketamine anyone]
26 25 434 25_ticket_ticketmaster_ticketw_tickets [ticket, ticketmaster, ticketw, tickets, suppo... [help support ticket please, help support tick...
27 26 429 26_meth_methbusters_methamphetamine_crystal [meth, methbusters, methamphetamine, crystal, ... [crystal meth uk, crystal meth, crystal meth v...
In [25]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6434006690979004
Davies_bouldin_score: 0.4681034572960446
In [26]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
In [27]:
topic_model.visualize_topics()

27DistanceTimeSeries_0.65_400.png

In [28]:
topic_model.visualize_heatmap()

image.png

In [29]:
topic_model.visualize_hierarchy()

image.png

In [30]:
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, 
                          min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

image.png

In [121]:
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.5)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
2024-06-27 14:34:02,549 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
Out[121]:
Topic Count Name Representation Representative_Docs
0 -1 27323 -1_anyone_vendor_order_review [anyone, vendor, order, review, new, get, acco... [dutchdrugz updates promo active till market p...
1 0 5137 0_weed_cannabis_cart_review [weed, cannabis, cart, review, thc, vendor, oz... [sale girl scout cookie carts strains oz lb us...
2 1 2700 1_help_login_need_account [help, login, need, account, sub, back, passwo... [hey really could use help advice thanks, erro...
3 2 2601 2_cocaine_coke_heroin_drug [cocaine, coke, heroin, drug, vendor, uk, best... [colombian coke brazil ship world wide promoti...
4 3 2270 3_xanax_mg_adderall_alprazolam [xanax, mg, adderall, alprazolam, bar, diazepa... [adderall mg ir adderall mg xanax super sale, ...
5 4 2031 4_order_shipping_package_delivery [order, shipping, package, delivery, shipped, ... [informed delivery showing package, usa canada...
6 5 1861 5_darknet_dark_tor_web [darknet, dark, tor, web, onion, dark web, dar... [three student arrested dark web drug traffick...
7 6 1826 6_empire_empire market_empire empire_market [empire, empire market, empire empire, market,... [empire anyone else, empire market back, empir...
8 7 1653 7_mdma_pill_mda_xtc [mdma, pill, mda, xtc, mdma vendor, mg, usa, p... [sale xtc pill mg mda us ca, uk mdma pill vend...
9 8 1628 8_card_carding_cc_credit [card, carding, cc, credit, cvv, credit card, ... [carding amazon gift card, gift card prepaid d...
10 9 3010 9_vendor_vendor vendor_inquiry_vendor inquiry [vendor, vendor vendor, inquiry, vendor inquir... [nmm giving vendor runaround lying acting shad...
11 10 1741 10_scam_scammer_exit_scamming [scam, scammer, exit, scamming, scammed, exit ... [market exit scam next, scam alert ukdrugdeale...
12 11 1147 11_counterfeit_id_fake_passport [counterfeit, id, fake, passport, fake id, not... [buy counterfeit money real fake document, buy...
13 12 1202 12_dream_nightmare_dream market_market [dream, nightmare, dream market, market, night... [dream market still, dream market, eleven drea...
14 13 1009 13_lsd_ug_tab_lsd vendor [lsd, ug, tab, lsd vendor, acid, free, lsd tab... [lsd blotter tab ug top quality, point one fre...
15 14 854 14_monero_btc_bitcoin_coin [monero, btc, bitcoin, coin, crypto, wallet, b... [looking best safe way buy large amount bitcoi...
16 15 926 15_review_vendor review_vendor_review vendor [review, vendor review, vendor, review vendor,... [needing send sample bar trusted reviewer woul...
17 16 681 16_heard_anyone_anyone heard_happened [heard, anyone, anyone heard, happened, has, h... [anybody heard theoutfit, anybody heard pickle...
18 17 989 17_market_market market_new market_new [market, market market, new market, new, apoll... [market anyone else, market, currently working...
19 18 764 18_crosspost_review crosspost_crosspost vendor... [crosspost, review crosspost, crosspost vendor... [envoy want crosspost, could vendor crosspost,...
20 19 671 19_deposit_deposited_ticket_address [deposit, deposited, ticket, address, double, ... [missing deposit double deposit please help, a...
21 20 596 20_pgp_key_pgp key_public [pgp, key, pgp key, public, public pgp, messag... [pgp public key, market pgp key, find pgp key]
22 21 551 21_dispute_dispute dispute_mod_moderator [dispute, dispute dispute, mod, moderator, ple... [moderator dispute day, moderator please help ...
23 22 480 22_cryptonia_samsara_samsara market_cryptonia ... [cryptonia, samsara, samsara market, cryptonia... [cryptonia market, market king samsara crypton...
24 23 485 23_wsm_wsm wsm_wsm vendor_vendor wsm [wsm, wsm wsm, wsm vendor, vendor wsm, vendor,... [wsm vendor, wsm back, wsm down]
25 24 468 24_ketamine_ketamine vendor_mdma ketamine_keta... [ketamine, ketamine vendor, mdma ketamine, ket... [ketamine us, get ketamine, ketamine anyone]
26 25 458 25_ticket_support ticket_support_please [ticket, support ticket, support, please, mont... [help support ticket please, help support tick...
27 26 467 26_meth_crystal meth_crystal_meth vendor [meth, crystal meth, crystal, meth vendor, met... [crystal meth uk, crystal meth, crystal meth v...
In [153]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
In [ ]:
zero_shot_topics = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()
dict_zero_shots_25 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.25)
dict_zero_shots_2 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.2)
dict_zero_shots_17 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.17)
dict_zero_shots_15 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.15)
In [392]:
dict_zero_shots_2[18] = 'crosspost vendor'
dict_zero_shots_2[22] = 'samsara market'
dict_zero_shots_2[23] = 'wsm market'
In [390]:
pd.DataFrame(list(dict_zero_shots_25.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_025.csv', index=False)
pd.DataFrame(list(dict_zero_shots_2.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_020.csv', index=False)
pd.DataFrame(list(dict_zero_shots_17.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_017.csv', index=False)
pd.DataFrame(list(dict_zero_shots_15.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_400/zero_shot_015.csv', index=False)
In [395]:
topic_model.set_topic_labels(dict_zero_shots_2)
In [396]:
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True, custom_labels=True)

image.png

In [397]:
topic_model.visualize_hierarchy(custom_labels=True)

image.png

In [44]:
topic_model.visualize_topics()

image.png

In [400]:
topic_model.visualize_barchart(top_n_topics=25, custom_labels=True, n_words=10)

image.png

In [125]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5175204277038574
Davies_bouldin_score: 0.7919422601150089
In [26]:
topic_words = topic_model.get_topics()
topics = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]

topn = 10
topic_list = []
for topic in topics:
    topic_list.append([word for word, _ in topic[:topn]])

coherence_model = CoherenceModel(
    topics=topic_list,
    texts=[doc.split() for doc in tc1.corpus],
    dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
    coherence='c_v'
)

print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.5751057167740472
In [ ]:
df['name_thread'] = df['name_thread'].str.lower().dropna()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()
len(created_on)
In [406]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=700, custom_labels=True)
15it [00:24,  1.62s/it]

image.png

In [403]:
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
(38274, 11)
Out[403]:
Document Embedding Topic Probability Created_on Count Name CustomName Representation Representative_Docs UMAP_embedding
0 review empire vendor acidbern [-0.07762138, -0.049061198, -0.046745114, -0.0... 6 0.527385 2020-01-09 1826 6_empire_empire market_empire empire_market empire market [empire, empire market, empire empire, market,... [empire anyone else, empire market back, empir... [9.086779, 3.6718397, 8.9006195, -1.1745992, 1...
1 vendor shipping combine priority [-0.027722627, -0.0031221025, 0.01195772, -0.0... 4 0.962274 2019-11-06 2031 4_order_shipping_package_delivery order [order, shipping, package, delivery, shipped, ... [informed delivery showing package, usa canada... [9.679236, 2.7164314, 8.733615, 0.011899776, 8...
2 open ticket since may ticket [0.055031013, -0.018210536, -0.0026789573, -0.... 25 1.000000 2020-01-09 458 25_ticket_support ticket_support_please ticket support - ask help [ticket, support ticket, support, please, mont... [help support ticket please, help support tick... [9.901975, 5.2703958, 11.463735, 0.47217792, 8...
3 vendor inquiry destroid dream [-0.023196185, 0.0573189, 0.028408512, -0.0222... 9 0.000000 2019-11-06 3010 9_vendor_vendor vendor_inquiry_vendor inquiry inquiry - vendor vendor - vendor [vendor, vendor vendor, inquiry, vendor inquir... [nmm giving vendor runaround lying acting shad... [9.912251, 4.028657, 7.623224, -0.7158077, 9.2...
4 morrison saver stamps uk money maker easiest m... [-0.020903945, 0.050762244, -0.041445963, 0.01... 11 0.799023 2020-01-09 1147 11_counterfeit_id_fake_passport counterfeit money - fake IDs [counterfeit, id, fake, passport, fake id, not... [buy counterfeit money real fake document, buy... [9.859931, 3.1459394, 9.145497, -1.0489817, 9....
In [ ]:
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_400", serialization="pickle", save_ctfidf=True, save_embedding_model=model)
In [405]:
results_final.to_parquet('ResultsBERTopic/BERTopic_all-MiniLM-L6-v2_400.parquet')

200 all-MiniLM-L6-v2¶

In [ ]:
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=15, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=200, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    top_n_words=10, 
    n_gram_range=(1, 2),
    umap_model=umap_model, 
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model, 
    ctfidf_model=ctfidf_model, 
    representation_model=[mmr, kw],
    embedding_model=model,
    verbose=True
)

topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
In [ ]:
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
In [6]:
topic_model.get_topic_info()
Out[6]:
Topic Count Name Representation Representative_Docs
0 -1 23926 -1_opiateconnect_heinekenexpress_buy_dmt [opiateconnect, heinekenexpress, buy, dmt, sho... [good source dmt dream market, tramadol mg mg ...
1 0 5394 0_carts_vape_cart_cannabis [carts, vape, cart, cannabis, marijuana, straw... [mg thc gummies cherry raspberry strawberry fl...
2 1 2651 1_delivery_shipment_postage_delivered [delivery, shipment, postage, delivered, posta... [package say delivered po box never got, mail ...
3 2 2204 2_vendor_vendors_vendorbbmc_trusted [vendor, vendors, vendorbbmc, trusted, supplie... [im looking vendor named buths bhuts shipping ...
4 3 1951 3_guy_post_community_sub [guy, post, community, sub, idea, posting, nig... [sup fam ya boy ng min anybody legit right, hi...
5 4 1715 4_empiremarket_empire_empiredealer_empireteam [empiremarket, empire, empiredealer, empiretea... [empire vendor, give me empire, top empire ven...
6 5 1694 5_dreammarket_market_markets_nightmaremarket [dreammarket, market, markets, nightmaremarket... [miss dream ca nt use wallstreet market, wall ...
7 6 1550 6_scamming_scammer_scam_scammers [scamming, scammer, scam, scammers, scams, sca... [cottageindustry possibly exit scamming select...
8 7 1411 7_darkweb_sentenced_darknetmarketsnoobs_darkne... [darkweb, sentenced, darknetmarketsnoobs, dark... [darkweb vendor happytimes sentenced five year...
9 8 1390 8_coca_cocain_cocacolacompany_cocainehcl [coca, cocain, cocacolacompany, cocainehcl, co... [review high purity colombian coke brazil, ful...
10 9 1240 9_xanaxlabs_xanaxlife_xanax_xanaxcartel [xanaxlabs, xanaxlife, xanax, xanaxcartel, xan... [frankie new vendor mg real alprazolam xanax b...
11 10 1130 10_mdma_mdmaus_mda_mdmamaster [mdma, mdmaus, mda, mdmamaster, mdmamphetamine... [per gram high purity mda promotion active sel...
12 11 997 11_lsd_shrooms_tab_acid [lsd, shrooms, tab, acid, tabs, psychedelic, m... [point one lsd blotters lsd tab void realm tea...
13 12 911 12_det_dere_igjen_en [det, dere, igjen, en, privnote, kan, esrar, s... [lever fortsatt valhalla noen som har en invit...
14 13 813 13_monero_moneroatms_wallet_coin [monero, moneroatms, wallet, coin, bitcoin, cr... [buying coin anonymously needed monero, noob n...
15 14 799 14_mastercard_card_usacards_cards [mastercard, card, usacards, cards, carding, p... [buying prepaid debit card btc eu, online card...
16 15 687 15_tor_torguard_torbox_vpn [tor, torguard, torbox, vpn, torstreet, vpns, ... [configure tor browser disable javascript, use...
17 16 659 16_crosspost_crossposting_goingpostal_crosspdf [crosspost, crossposting, goingpostal, crosspd... [lesson learnd googleplex saga prolific bar de...
18 17 658 17_review_reviews_reviewer_reviewing [review, reviews, reviewer, reviewing, reviewe... [xpost danknation vendor review sunaero multis...
19 18 636 18_marketplace_coremarket_market_markets [marketplace, coremarket, market, markets, non... [none marketplace link working, currently work...
20 19 584 19_moderator_mod_dispute_disputee [moderator, mod, dispute, disputee, disputers,... [mod admin help dispute, dispute moderator ple...
21 20 569 20_pgpkey_pgp_pgps_pg [pgpkey, pgp, pgps, pg, key, gnupg, gpg, keys,... [pgp public key, market pgp key, pgp key]
22 21 568 21_deposit_depositing_deposits_deposited [deposit, depositing, deposits, deposited, add... [btc deposit issue ticket, missing deposit dou...
23 22 539 22_passport_passports_fakeid_certificate [passport, passports, fakeid, certificate, for... [photoshop documents fakeid photo id address p...
24 23 478 23_cryptonia_cryptoniausers_cryptonians_cryptn... [cryptonia, cryptoniausers, cryptonians, crypt... [cryptonia market, market king samsara crypton...
25 24 468 24_wsm_wkr_whita_terpwax [wsm, wkr, whita, terpwax, whachu, wowza, gree... [back me wsm, wsm back, wsm vendor]
26 25 447 25_bunk_bar_bars_selaminy [bunk, bar, bars, selaminy, thegeniusbar, bars... [selaminy bar review, bunk pack selaminy, sela...
27 26 444 26_meth_methbusters_methamphetamine_methamph [meth, methbusters, methamphetamine, methamph,... [looking crystal meth, crystal meth uk, crysta...
28 27 443 27_ketamine_ketamin_ketamineking_ketaminekings [ketamine, ketamin, ketamineking, ketamineking... [ketamine uk vendor, review ketamine, ketamine...
29 28 438 28_ticket_ticketmaster_ticketing_ticketw [ticket, ticketmaster, ticketing, ticketw, tic... [support ticket open month, support ticket tic...
30 29 416 29_counterfeitmoney_counterfeit_counterfeits_c... [counterfeitmoney, counterfeit, counterfeits, ... [find best usd counterfeit note, best counterf...
31 30 415 30_login_logins_password_authentication [login, logins, password, authentication, mult... [password login disabled, login problem fa err...
32 31 409 31_ecstasy_ecstasydata_pill_pillsexpress [ecstasy, ecstasydata, pill, pillsexpress, pil... [best ecstasy pill, samsung mg ecstasy pills u...
33 32 409 32_hacking_hacker_hackerforhire_hackers [hacking, hacker, hackerforhire, hackers, hack... [job btc hacking service needed, looking profe...
34 33 401 33_adderall_adderalls_adderal_adderallz [adderall, adderalls, adderal, adderallz, adde... [back mg adderall ir straight pharmacy brand n...
35 34 392 34_tails_tail_wallet_monero [tails, tail, wallet, monero, electrum, electr... [electrum tail personal monero wallet, tails e...
36 35 376 35_mushrooms_mushroommafia_mushroom_mushroomchick [mushrooms, mushroommafia, mushroom, mushroomc... [mushcanada free sample grams psilocybe cubens...
37 36 369 36_xmr_xmrs_btc_lfwxmr [xmr, xmrs, btc, lfwxmr, xmrto, btcoin, xmrtop... [xmr btc empire, btc xmr, xmr btc xmr xmr]
38 37 349 37_drugmarket_drugpics_drugs_drugsource [drugmarket, drugpics, drugs, drugsource, drug... [energy control international use and abuse of...
39 38 344 38_dread_dreade_dreaddit_dreaddits [dread, dreade, dreaddit, dreaddits, dreadadve... [new dread since dream, dread back, dread well]
40 39 315 39_withdraw_withdrawling_withdrawing_withdrawled [withdraw, withdrawling, withdrawing, withdraw... [made withdraw btc, withdrawal working stuck p...
41 40 311 40_escrow_escrows_payment_multisignature [escrow, escrows, payment, multisignature, mar... [escrow, much escrow, full escrow]
42 41 302 41_heroin_opium_heroinreview_heroinfactory [heroin, opium, heroinreview, heroinfactory, h... [liquidgold afghan burmese heroin sale extende...
43 42 300 42_oxycodone_oxycocodone_oxicodone_oxycodon [oxycodone, oxycocodone, oxicodone, oxycodon, ... [mg oxycodone instant release supeudol origina...
44 43 285 43_dnm_dmn_dnms_dnmrelated [dnm, dmn, dnms, dnmrelated, dm, dwm, dnmsuper... [dnm avenger link, new dnm order, call dnm ven...
45 44 266 44_paypal_paypalshow_paypals_transfers [paypal, paypalshow, paypals, transfers, trans... [looking legit website bank western union payp...
46 45 262 45_ddos_ddosd_attacks_attack [ddos, ddosd, attacks, attack, ddosed, attacke... [new ddos attack, attack ddos, ddos attack]
47 46 252 46_fraud_fraudsters_fraudster_frauding [fraud, fraudsters, fraudster, frauding, fraud... [new fraud vendor, fraud vendor, fraud]
48 47 247 47_benzoblotters_benzobuddies_benzos_benzo [benzoblotters, benzobuddies, benzos, benzo, b... [czech republic worldwide discreetlab selling ...
49 48 231 48_apollonmarket_apollon_market_apollo [apollonmarket, apollon, market, apollo, myste... [mysteryland apollon market big promotion deal...
50 49 230 49_phishing_phising_phish_phishy [phishing, phising, phish, phishy, phissing, p... [phishing warning, phishing link, warning empi...
51 50 217 50_opsec_opsexy_opec_opspec [opsec, opsexy, opec, opspec, opsecaholic, net... [dream opsec, opsec, opsec question]
52 51 213 51_mirror_mirrors_reflection_links [mirror, mirrors, reflection, links, url, link... [mirror link working, anyone working mirror li...
53 52 212 52_links_link_pm_works [links, link, pm, works, need, url, send, work... [please pm someone working link, someone pm wo...
54 53 207 53_fentanyl_fentantyl_carfentanyl_carfentanil [fentanyl, fentantyl, carfentanyl, carfentanil... [furanyl fentanyl fentanyl analogue eu, lookin...
55 54 203 54_cgmc_invitation_ggmc_invite [cgmc, invitation, ggmc, invite, cmc, gcmc, co... [need cgmc invite code, cgmc invite code, invi...
56 55 202 55_cvv_cvvs_ccv_cvvbilling [cvv, cvvs, ccv, cvvbilling, cmv, ccs, vcc, cc... [looking trusted cc cvv vendor, uk cc cvv vend...
In [7]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6660425662994385
Davies_bouldin_score: 0.3869296287979983
In [ ]:
topic_model.reduce_topics(tc1.corpus, nr_topics='auto')
topics = topic_model.topics_
In [9]:
topic_model.get_topic_info()
Out[9]:
Topic Count Name Representation Representative_Docs
0 -1 23926 -1_heinekenexpress_dmt_opiateconnect_tramadol [heinekenexpress, dmt, opiateconnect, tramadol... [good source dmt dream market, need know start...
1 0 11635 0_cannabis_sale_edibles_price [cannabis, sale, edibles, price, weed, shippin... [adderall mg pharma gram aaa indoor nugs ounce...
2 1 4393 1_scamming_scammer_scam_scammed [scamming, scammer, scam, scammed, scams, phis... [sale customer scamming alert vendor, partysqu...
3 2 2651 2_delivery_package_shipment_postage [delivery, package, shipment, postage, shippin... [package marked delivered never arrived, packa...
4 3 1951 3_post_posting_advice_community [post, posting, advice, community, newbie, que... [sup fam ya boy ng min anybody legit right, hi...
5 4 1715 4_empire_empiremarket_empiredealer_empireteam [empire, empiremarket, empiredealer, empiretea... [empire deposit support, empire now back, empi...
6 5 1694 5_dreammarket_dream_dreams_dreaming [dreammarket, dream, dreams, dreaming, nightma... [new wall st use dream quick question, dream m...
7 6 1411 6_darkweb_darkbay_darknetmarkets_sentenced [darkweb, darkbay, darknetmarkets, sentenced, ... [father son sentenced prison selling drugs dar...
8 7 1130 7_mdma_mdmamaster_mdmaus_mda [mdma, mdmamaster, mdmaus, mda, mdmamphetamine... [best domestic mdma mda fast shipping tracked ...
9 8 911 8_det_dere_je_nede [det, dere, je, nede, noen, du, igjen, vous, s... [hejlpe til finne ut av hva jeg har mottatt, z...
10 9 813 9_monero_moneroatms_wallet_bitcoin [monero, moneroatms, wallet, bitcoin, crypto, ... [monero btc, noob need help buying bitcoin mon...
11 10 799 10_mastercard_card_carder_carding [mastercard, card, carder, carding, cards, car... [credit score balance hq debit card fullz appl...
12 11 687 11_tor_torguard_vpn_torbox [tor, torguard, vpn, torbox, vpns, torshops, t... [really safe using tor vpn, use vpn tor tails,...
13 12 659 12_crosspost_goingpostal_posted_marketplace [crosspost, goingpostal, posted, marketplace, ... [someone posted witchman account crosspost, ma...
14 13 658 13_review_reviewing_reviews_reviewed [review, reviewing, reviews, reviewed, reviewf... [review please, xpost danknation vendor review...
15 14 636 14_coremarket_marketplace_markets_market [coremarket, marketplace, markets, market, non... [none marketplace link working, core marketpla...
16 15 584 15_dispute_moderator_disputes_disputers [dispute, moderator, disputes, disputers, mod,... [moderator please help dispute, dispute modera...
17 16 569 16_pgpkey_pgp_pgps_pg [pgpkey, pgp, pgps, pg, key, gnupg, keys, gpg,... [find pgp key, pgp key, vendor pgp key]
18 17 568 17_deposit_depositing_deposits_deposited [deposit, depositing, deposits, deposited, btc... [generated deposit address deposited multiple ...
19 18 539 18_passport_passports_fakeid_certificate [passport, passports, fakeid, certificate, for... [photoshop documents fakeid photo id address p...
20 19 478 19_cryptonia_cryptonians_cryptoniausers_cryptn... [cryptonia, cryptonians, cryptoniausers, crypt... [cryptonia already, everyone move cryptonia ma...
21 20 468 20_wsm_wsms_wkr_wxtra [wsm, wsms, wkr, wxtra, whita, terpwax, whachu... [back me wsm, wsm back, wsm vendor]
22 21 447 21_bunk_bars_bar_barsbaby [bunk, bars, bar, barsbaby, lonestarbars, theg... [bunk pack selaminy, bunk bar, selaminy hulk b...
23 22 443 22_ketamine_ketamineking_ketamin_ketaminekings [ketamine, ketamineking, ketamin, ketamineking... [review ketamine, ketamine review, ketamine us]
24 23 438 23_ticket_ticketmaster_tickets_support [ticket, ticketmaster, tickets, support, conce... [support ticket support ticket, support ticket...
25 24 416 24_counterfeit_counterfeitmoney_counterfeits_c... [counterfeit, counterfeitmoney, counterfeits, ... [counterfeit note, find best usd counterfeit n...
26 25 415 25_login_logins_password_authentication [login, logins, password, authentication, logg... [password changed lost ca nt log, login proble...
27 26 392 26_tails_tail_electrum_electrumtails [tails, tail, electrum, electrumtails, electru... [updated tail electrum issue setting gui moner...
28 27 376 27_mushrooms_mushroommafia_mushroom_shrooms [mushrooms, mushroommafia, mushroom, shrooms, ... [mushcanada free sample grams psilocybe cubens...
29 28 369 28_xmr_xmrs_lfwxmr_xmrto [xmr, xmrs, lfwxmr, xmrto, btc, xmrtopy, xanxa... [btc xmr, xmr btc, xmr btc xmr xmr]
30 29 344 29_dread_dreaddit_dreaddits_dreadonion [dread, dreaddit, dreaddits, dreadonion, dread... [dread back, anything dread, dread well]
31 30 315 30_withdraw_withdrawling_withdrawl_withdrawing [withdraw, withdrawling, withdrawl, withdrawin... [withdraw problem pending withdraw hour, withd...
32 31 311 31_escrow_escrows_marketplace_payment [escrow, escrows, marketplace, payment, commis... [full escrow, escrow first, multisig escrow qu...
33 32 285 32_dnm_dmn_dnms_dnmrelated [dnm, dmn, dnms, dnmrelated, dm, dwm, dnmarket... [new dnm first order question, dnm avenger lin...
34 33 266 33_paypal_paypalshow_paypals_transfers [paypal, paypalshow, paypals, transfers, trans... [looking legit website bank western union payp...
35 34 262 34_ddos_ddosd_attacks_attack [ddos, ddosd, attacks, attack, ddosed, attacke... [anything new nightmare ddos attack, ddos atta...
36 35 252 35_fraud_fraudsters_fraudster_frauding [fraud, fraudsters, fraudster, frauding, fraud... [new fraud vendor, fraud vendor, fraud]
37 36 247 36_benzoblotters_benzobuddies_benzos_benzo [benzoblotters, benzobuddies, benzos, benzo, b... [czech republic worldwide discreetlab selling ...
38 37 231 37_apollonmarket_apollon_market_apollo [apollonmarket, apollon, market, apollo, myste... [mysteryland apollon market big promotion deal...
39 38 217 38_opsec_opsexy_opspec_opec [opsec, opsexy, opspec, opec, opsecaholic, net... [dream opsec, opsec question, opsec]
40 39 213 39_mirror_mirrors_empire_reflection [mirror, mirrors, empire, reflection, working,... [empire mirror working, anyone working mirror ...
41 40 212 40_links_link_pm_works [links, link, pm, works, need, url, working, s... [working link please pm, please pm someone wor...
42 41 203 41_cgmc_invitation_ggmc_invite [cgmc, invitation, ggmc, invite, cmc, gcmc, co... [need cgmc invite code, invite code cgmc, cgmc...
In [10]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.34653472900390625
Davies_bouldin_score: 0.7209094786047956
In [11]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
In [12]:
topic_model.visualize_topics()

image.png

In [13]:
topic_model.visualize_hierarchy()

image.png

In [14]:
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, 
                          min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

image.png

In [16]:
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.6)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
2024-06-28 14:20:51,371 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
Out[16]:
Topic Count Name Representation Representative_Docs
0 -1 23556 -1_vendor_anyone_review_new [vendor, anyone, review, new, account, order, ... [good source dmt dream market, need know start...
1 0 11636 0_weed_xanax_lsd_review [weed, xanax, lsd, review, cocaine, mg, vendor... [adderall mg pharma gram aaa indoor nugs ounce...
2 1 4422 1_vendor_scammer_scam_scamming [vendor, scammer, scam, scamming, exit, phishi... [sale customer scamming alert vendor, partysqu...
3 2 2655 2_order_shipping_pack_package [order, shipping, pack, package, delivery, shi... [package marked delivered never arrived, packa...
4 3 1952 3_help_guy_need_back [help, guy, need, back, day, time, question, a... [sup fam ya boy ng min anybody legit right, hi...
5 4 1734 4_empire_empire market_empire empire_market [empire, empire market, empire empire, market,... [empire deposit support, empire now back, empi...
6 5 1696 5_dream_dream market_nightmare_market [dream, dream market, nightmare, market, walls... [new wall st use dream quick question, dream m...
7 6 1411 6_darknet_dark_web_dark web [darknet, dark, web, dark web, darkfail, sente... [father son sentenced prison selling drugs dar...
8 7 1151 7_mdma_mdma vendor_mda_usa [mdma, mdma vendor, mda, usa, sale, mdma revie... [best domestic mdma mda fast shipping tracked ...
9 8 911 8_anyone_heard_happened_de [anyone, heard, happened, de, anyone heard, ha... [hejlpe til finne ut av hva jeg har mottatt, z...
10 9 824 9_monero_bitcoin_btc_coin [monero, bitcoin, btc, coin, wallet, crypto, b... [monero btc, noob need help buying bitcoin mon...
11 10 816 10_carding_card_credit_credit card [carding, card, credit, credit card, debit, pr... [credit score balance hq debit card fullz appl...
12 11 687 11_onion_tor_vpn_javascript [onion, tor, vpn, javascript, browser, tor bro... [really safe using tor vpn, use vpn tor tails,...
13 12 683 12_crosspost_review crosspost_giveaway_review [crosspost, review crosspost, giveaway, review... [someone posted witchman account crosspost, ma...
14 13 706 13_review_vendor review_review vendor_vendor [review, vendor review, review vendor, vendor,... [review please, xpost danknation vendor review...
15 14 697 14_market_market market_new market_marketplace [market, market market, new market, marketplac... [none marketplace link working, core marketpla...
16 15 585 15_dispute_moderator_mod_dispute dispute [dispute, moderator, mod, dispute dispute, ple... [moderator please help dispute, dispute modera...
17 16 573 16_pgp_key_pgp key_public [pgp, key, pgp key, public, public pgp, lost, ... [find pgp key, pgp key, vendor pgp key]
18 17 578 17_deposit_deposited_address_btc [deposit, deposited, address, btc, btc deposit... [generated deposit address deposited multiple ...
19 18 540 18_id_fake_passport_fake id [id, fake, passport, fake id, license, scan, d... [photoshop documents fakeid photo id address p...
20 19 482 19_cryptonia_samsara_samsara market_cryptonia ... [cryptonia, samsara, samsara market, cryptonia... [cryptonia already, everyone move cryptonia ma...
21 20 485 20_wsm_wsm wsm_wsm vendor_vendor wsm [wsm, wsm wsm, wsm vendor, vendor wsm, vendor,... [back me wsm, wsm back, wsm vendor]
22 21 449 21_bar_bunk_selaminy_bars [bar, bunk, selaminy, bars, hulk, bunk bar, th... [bunk pack selaminy, bunk bar, selaminy hulk b...
23 22 445 22_ketamine_ketamine vendor_ketamine review_re... [ketamine, ketamine vendor, ketamine review, r... [review ketamine, ketamine review, ketamine us]
24 23 440 23_ticket_support ticket_support_please [ticket, support ticket, support, please, mont... [support ticket support ticket, support ticket...
25 24 430 24_counterfeit_euro_note_counterfeit euro [counterfeit, euro, note, counterfeit euro, co... [counterfeit note, find best usd counterfeit n...
26 25 418 25_login_account_password_log [login, account, password, log, fa, error, ca ... [password changed lost ca nt log, login proble...
27 26 393 26_tails_tail_electrum_wallet [tails, tail, electrum, wallet, whonix, monero... [updated tail electrum issue setting gui moner...
28 27 377 27_mushroom_shrooms_mushrooms_magic [mushroom, shrooms, mushrooms, magic, cubensis... [mushcanada free sample grams psilocybe cubens...
29 28 379 28_xmr_btc xmr_btc_xmrto [xmr, btc xmr, btc, xmrto, xmr btc, xmr deposi... [btc xmr, xmr btc, xmr btc xmr xmr]
30 29 348 29_dread_dread dread_sub dread_new dread [dread, dread dread, sub dread, new dread, sub... [dread back, anything dread, dread well]
31 30 325 30_withdraw_withdrawal_withdrawl_working [withdraw, withdrawal, withdrawl, working, btc... [withdraw problem pending withdraw hour, withd...
32 31 320 31_escrow_multisig_full escrow_extend [escrow, multisig, full escrow, extend, extend... [full escrow, escrow first, multisig escrow qu...
33 32 290 32_dnm_dnms_dn_dnstars [dnm, dnms, dn, dnstars, dnmuk, avenger, dm, d... [new dnm first order question, dnm avenger lin...
34 33 271 33_paypal_transfer_paypal transfer_paypal account [paypal, transfer, paypal transfer, paypal acc... [looking legit website bank western union payp...
35 34 264 34_ddos_ddos attack_attack_ddos ddos [ddos, ddos attack, attack, ddos ddos, market,... [anything new nightmare ddos attack, ddos atta...
36 35 262 35_fraud_fraudsters_fraud vendor_loan fraud [fraud, fraudsters, fraud vendor, loan fraud, ... [new fraud vendor, fraud vendor, fraud]
37 36 256 36_benzos_benzo_rc_benzo vendor [benzos, benzo, rc, benzo vendor, rc benzos, r... [czech republic worldwide discreetlab selling ...
38 37 232 37_apollon_apollon market_market_mysteryland [apollon, apollon market, market, mysteryland,... [mysteryland apollon market big promotion deal...
39 38 219 38_opsec_opsec question_opsec opsec_question [opsec, opsec question, opsec opsec, question,... [dream opsec, opsec question, opsec]
40 39 215 39_mirror_working mirror_working_mirror link [mirror, working mirror, working, mirror link,... [empire mirror working, anyone working mirror ...
41 40 213 40_link_working link_working_pm [link, working link, working, pm, link please,... [working link please pm, please pm someone wor...
42 41 203 41_cgmc_invite_invite code_code [cgmc, invite, invite code, code, cgmc invite,... [need cgmc invite code, invite code cgmc, cgmc...
In [ ]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
In [ ]:
zero_shot_topics = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()
dict_zero_shots_25 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.25)
dict_zero_shots_2 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.2)
In [29]:
dict_zero_shots_25[1] = 'phishing - scamming'
dict_zero_shots_25[12] = 'crosspost vendor'
dict_zero_shots_25[19] = 'cryphtonia market'
dict_zero_shots_25[20] = 'wsm market'
dict_zero_shots_25[21] = 'bunk bar'
dict_zero_shots_25[31] = 'escrow service'
dict_zero_shots_25[39] = 'mirror link'
dict_zero_shots_25[40] = 'link'
dict_zero_shots_25[41] = 'cmgc - invite' 
In [30]:
pd.DataFrame(list(dict_zero_shots_25.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_200/zero_shot_025.csv', index=False)
pd.DataFrame(list(dict_zero_shots_2.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_200/zero_shot_020.csv', index=False)
In [31]:
topic_model.set_topic_labels(dict_zero_shots_25)
In [32]:
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True, custom_labels=True)

image.png

In [33]:
topic_model.visualize_barchart(top_n_topics=42, custom_labels=True, n_words=10)

image.png

In [34]:
topic_model.visualize_topics()

image.png

In [35]:
topic_model.visualize_hierarchy(custom_labels=True)

image.png

In [18]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.34653472900390625
Davies_bouldin_score: 0.7209094786047956
In [24]:
topic_words = topic_model.get_topics()
topics = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]

topn = 10
topic_list = []
for topic in topics:
    topic_list.append([word for word, _ in topic[:topn]])

coherence_model = CoherenceModel(
    topics=topic_list,
    texts=[doc.split() for doc in tc1.corpus],
    dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
    coherence='c_v'
)

print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.5765415205607421
In [36]:
df['name_thread'] = df['name_thread'].str.lower().dropna()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()
len(created_on)
Out[36]:
65529
In [37]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=700, custom_labels=True)
15it [00:25,  1.72s/it]

image.png

In [39]:
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
print(results_final.shape)
results_final.head()
(41973, 10)
Out[39]:
Document Embedding Topic Probability Created_on Count Name CustomName Representation Representative_Docs
0 retirement sale one last blowout mdma dry spee... [-0.00200396, 0.060752388, 0.00081512495, -0.0... 7 0.393620 2020-01-09 1151 7_mdma_mdma vendor_mda_usa mdma - reviews vendor [mdma, mdma vendor, mda, usa, sale, mdma revie... [best domestic mdma mda fast shipping tracked ...
1 cash deposit [-0.0044404618, 0.016640304, -0.035438363, 0.0... 17 0.539291 2019-11-06 578 17_deposit_deposited_address_btc deposit - address - deposited [deposit, deposited, address, btc, btc deposit... [generated deposit address deposited multiple ...
2 import meth contact tracking [-0.05514505, -0.042183764, -0.060674116, -0.0... 0 1.000000 2020-01-09 11636 0_weed_xanax_lsd_review xanax - lsd - weed [weed, xanax, lsd, review, cocaine, mg, vendor... [adderall mg pharma gram aaa indoor nugs ounce...
3 please need working links [0.013639548, -0.030973928, -0.05787297, 0.026... 40 1.000000 2020-01-09 213 40_link_working link_working_pm link [link, working link, working, pm, link please,... [working link please pm, please pm someone wor...
4 reliable dexedrine vendor [-0.09150407, -0.024179617, 0.027147656, -0.06... 0 0.404354 2020-01-09 11636 0_weed_xanax_lsd_review xanax - lsd - weed [weed, xanax, lsd, review, cocaine, mg, vendor... [adderall mg pharma gram aaa indoor nugs ounce...
In [49]:
plt.figure(figsize=(10, 5))
sns.countplot(results_final, x='Topic', orient='h');
No description has been provided for this image
In [50]:
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_200", serialization="pickle", save_ctfidf=True, save_embedding_model=model)
2024-06-28 16:03:43,246 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.
In [52]:
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_200_safetensors", serialization="safetensors", save_ctfidf=True, save_embedding_model=model)
In [53]:
results_final.to_parquet('ResultsBERTopic/BERTopic_all-MiniLM-L6-v2_200.parquet')
In [6]:
import nbconvert

!jupyter nbconvert --to html show_results.ipynb

20n 150 all-MiniLM-L6-v2¶

In [117]:
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=20, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    top_n_words=10, 
    n_gram_range=(1, 2),
    umap_model=umap_model, 
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model, 
    ctfidf_model=ctfidf_model, 
    representation_model=[mmr, kw],
    embedding_model=model,
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
2024-06-30 15:40:03,123 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-06-30 15:42:22,877 - BERTopic - Dimensionality - Completed ✓
2024-06-30 15:42:22,905 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-06-30 15:43:23,274 - BERTopic - Cluster - Completed ✓
2024-06-30 15:43:23,475 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-06-30 15:44:03,645 - BERTopic - Representation - Completed ✓
In [118]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
In [119]:
topic_model.get_topic_info()
Out[119]:
Topic Count Name Representation Representative_Docs
0 -1 24995 -1_vendor_review_market_new [vendor, review, market, new, order, account, ... [review vendor cdnven product china white synt...
1 0 4998 0_weed_cannabis_cart_thc [weed, cannabis, cart, thc, review, oz, hash, ... [product vendor review ml lemon kush wax vape ...
2 1 1948 1_help_need_guy_sub [help, need, guy, sub, day, question, post, ad... [hi guy update post, way make sub like old red...
3 2 1845 2_order_shipping_package_pack [order, shipping, package, pack, delivery, shi... [package show informed delivery, usps informed...
4 3 1728 3_empire_empire market_empire empire_market [empire, empire market, empire empire, market,... [empire back, empire market back, empire suppo...
... ... ... ... ... ...
64 63 191 63_dmt_dmt vendor_odsmt_dmt dmt [dmt, dmt vendor, odsmt, dmt dmt, bluefairy, c... [best dmt vendor, dmt, vendor dmt]
65 64 186 64_captcha_rapture_rapture market_captcha captcha [captcha, rapture, rapture market, captcha cap... [captcha, use rapture registration login captc...
66 65 181 65_chemical_research_research chemical_chems [chemical, research, research chemical, chems,... [chem theory honorable research chemical suppl...
67 66 166 66_tor_browser_tor browser_tor network [tor, browser, tor browser, tor network, netwo... [tor browser help, review tor browser, tor bro...
68 67 159 67_mephedrone_meopcp_mxe_mescaline [mephedrone, meopcp, mxe, mescaline, mmc, meph... [eurovalz new stock list mxe mephedrone availa...

69 rows × 5 columns

In [120]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6823439598083496
Davies_bouldin_score: 0.3791311194398217
In [136]:
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.46)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
2024-06-30 16:01:57,452 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
Out[136]:
Topic Count Name Representation Representative_Docs
0 -1 18900 -1_anyone_get_update_review [anyone, get, update, review, new, order, acco... [review vendor cdnven product china white synt...
1 0 5021 0_weed_cannabis_cart_thc [weed, cannabis, cart, thc, review, hash, shat... [product vendor review ml lemon kush wax vape ...
2 1 2035 1_help_guy_sub_need [help, guy, sub, need, back, question, day, ad... [hi guy update post, way make sub like old red...
3 2 2005 2_order_shipping_package_pack [order, shipping, package, pack, delivery, shi... [package show informed delivery, usps informed...
4 3 1815 3_empire_empire market_empire empire_market [empire, empire market, empire empire, market,... [empire back, empire market back, empire suppo...
... ... ... ... ... ...
64 63 202 63_dmt_dmt vendor_dmt vape_odsmt [dmt, dmt vendor, dmt vape, odsmt, dmt dmt, ch... [best dmt vendor, dmt, vendor dmt]
65 64 187 64_captcha_rapture_rapture market_captcha captcha [captcha, rapture, rapture market, captcha cap... [captcha, use rapture registration login captc...
66 65 198 65_chemical_research_research chemical_chems [chemical, research, research chemical, chems,... [chem theory honorable research chemical suppl...
67 66 222 66_tor_tor browser_browser_tor network [tor, tor browser, browser, tor network, netwo... [tor browser help, review tor browser, tor bro...
68 67 178 67_mephedrone_meopcp_mxe_mescaline [mephedrone, meopcp, mxe, mescaline, mmc, meph... [eurovalz new stock list mxe mephedrone availa...

69 rows × 5 columns

In [137]:
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
Topic 0:
[('weed', 0.034440655421142205), ('cannabis', 0.02163153800597487), ('cart', 0.02131043599512726), ('thc', 0.01824969706580151), ('review', 0.014699009276947663), ('hash', 0.013941473812216146), ('shatter', 0.01385956329618397), ('distillate', 0.013788654857474974), ('oz', 0.013699156416790433), ('bud', 0.013511109713200431)]
Topic 1:
[('help', 0.020902783340739296), ('guy', 0.018141447515630817), ('sub', 0.018061502588594965), ('need', 0.017228855503283312), ('back', 0.015478023601901827), ('question', 0.014588537674805049), ('day', 0.014103368936720316), ('advice', 0.013364282528612713), ('shit', 0.01315161003792278), ('post', 0.013047995251142981)]
Topic 2:
[('order', 0.05141015231966171), ('shipping', 0.049459163735666216), ('package', 0.034053136968648846), ('pack', 0.03022209818441062), ('delivery', 0.02795081519396416), ('shipped', 0.02329856657883124), ('ship', 0.018460022471923705), ('usps', 0.01791841479902273), ('delivered', 0.013805279333056695), ('international', 0.013489901890926527)]
Topic 3:
[('empire', 0.1408905571475779), ('empire market', 0.047832772840589455), ('empire empire', 0.031829806331982104), ('market', 0.022805110300898157), ('deposit', 0.013311333177807824), ('vendor empire', 0.013156717730784043), ('market empire', 0.011089067660616526), ('ticket', 0.01074427553045342), ('empire vendor', 0.010698545017112069), ('link', 0.010681298135100263)]
Topic 4:
[('vendor', 0.07352353126224266), ('vendor vendor', 0.027793634237021102), ('vendor inquiry', 0.01767525614277819), ('inquiry', 0.017541981259577955), ('new vendor', 0.016785192087489506), ('new', 0.014829319198509949), ('looking', 0.012972691707067285), ('trusted', 0.01072558646684358), ('looking vendor', 0.010710900710690481), ('legit', 0.009197289255501064)]
Topic 5:
[('scammer', 0.09029491748260987), ('scam', 0.08790776767015127), ('exit', 0.05409099855154675), ('scamming', 0.05345669284651443), ('scammed', 0.04387510350103406), ('exit scam', 0.03795962281948466), ('selective', 0.026166507631807726), ('exit scamming', 0.02118137287516553), ('alert', 0.018586128265137065), ('warning', 0.016658284455618223)]
Topic 6:
[('darknet', 0.06574314894703666), ('dark', 0.05575058109762389), ('web', 0.04056458466112466), ('dark web', 0.03888520120543353), ('darkfail', 0.0326590485356795), ('darkweb', 0.022966993332759263), ('sentenced', 0.02278933896007672), ('drug', 0.01891860040084595), ('prison', 0.016267179639628225), ('dark net', 0.01581062364638907)]
Topic 7:
[('mdma', 0.15108386628760112), ('mda', 0.0429162477081238), ('mdma vendor', 0.034000537140433544), ('domestic', 0.0190830757862918), ('usa', 0.018739701436677143), ('us', 0.0185565173484347), ('mdma review', 0.016368695282717644), ('domestic mdma', 0.015310305078250312), ('sale', 0.01522592122356552), ('mda powder', 0.014536706585474362)]
Topic 8:
[('xanax', 0.14902915079054058), ('mg', 0.037073236799781265), ('diazepam', 0.030553350154839825), ('xanax vendor', 0.03030524528382461), ('valium', 0.02763017540016205), ('mg xanax', 0.024213410884215428), ('bar', 0.023464625343629694), ('xanax bar', 0.02288490630780396), ('xanax mg', 0.020390790169055237), ('bars', 0.01626062009535231)]
Topic 9:
[('lsd', 0.1734647507288589), ('ug', 0.04747007330538135), ('tab', 0.04142689902020762), ('lsd vendor', 0.04039618697559746), ('acid', 0.022771452107687707), ('lsd tab', 0.022084975534138424), ('gammagoblin', 0.020282120388494473), ('free', 0.018343364070410467), ('usa', 0.016690416345776463), ('mdma', 0.016209423140569452)]
Topic 10:
[('crosspost', 0.14470709618860572), ('giveaway', 0.04202963247681046), ('review crosspost', 0.02057311906846835), ('crosspost vendor', 0.01589126119057147), ('review', 0.015116522044644013), ('crosspost review', 0.01268557574967833), ('crosspost new', 0.010364520318510435), ('winner', 0.010310623869567707), ('vendor crosspost', 0.010105053266291379), ('envoy', 0.00986726589013657)]
Topic 11:
[('monero', 0.09836195098914442), ('btc', 0.06363293656791628), ('bitcoin', 0.05875105248934212), ('coin', 0.0400067415743657), ('wallet', 0.03272232748466129), ('crypto', 0.031144635815675798), ('buying', 0.023400549290578096), ('buy', 0.022725372561839517), ('way', 0.019920869234552413), ('anonymously', 0.017395228092262436)]
Topic 12:
[('carding', 0.13434099717006076), ('card', 0.12768798940084336), ('credit', 0.03905087805207301), ('credit card', 0.037218429781870184), ('debit', 0.031090671713331016), ('gift', 0.026719556198699018), ('debit card', 0.02623880953133958), ('gift card', 0.025296889224605497), ('prepaid', 0.02390841050419752), ('cards', 0.02234112624489392)]
Topic 13:
[('dream', 0.16188938874288553), ('dream market', 0.0769761528601036), ('nightmare', 0.057569364500263495), ('market', 0.04348747836116363), ('dream dream', 0.03160064455881955), ('nightmare market', 0.015907385449258748), ('market dream', 0.015385428217323891), ('dream nightmare', 0.015124657030832909), ('anyone', 0.013442507797909524), ('dream alt', 0.013284958959525854)]
Topic 14:
[('dispute', 0.19114992246810456), ('mod', 0.06456174249994076), ('moderator', 0.04459012311206817), ('dispute dispute', 0.03637392127596315), ('please', 0.033520993517153756), ('help', 0.02470584993561166), ('help dispute', 0.021362067564137565), ('resolved', 0.020927566860518908), ('admin', 0.020875348152861174), ('dispute vendor', 0.020570879876576915)]
Topic 15:
[('cocaine', 0.19185759141507205), ('cocaine vendor', 0.033820265777428325), ('fishscale', 0.03184970819993608), ('peruvian', 0.02886221748054375), ('colombian', 0.025258187463892433), ('uncut', 0.02489998326649046), ('pure', 0.02338967723120477), ('fishscale cocaine', 0.02328527599852838), ('ukwhite', 0.02053657782269459), ('cocaine review', 0.019696278673770187)]
Topic 16:
[('review', 0.13479635045612737), ('vendor review', 0.09882182412820924), ('review vendor', 0.056146396360581954), ('vendor', 0.042324514725933904), ('review review', 0.026492179649533248), ('feedback', 0.018191404917665587), ('reviews', 0.017732395258446845), ('review template', 0.012447521444075921), ('sample', 0.012252829983770771), ('template', 0.010912428935692265)]
Topic 17:
[('market', 0.13229049891457828), ('market market', 0.02912990577253405), ('new market', 0.02733587453280759), ('markets', 0.018260042378394587), ('marketplace', 0.018237246649367742), ('new', 0.01653915451535478), ('core', 0.01391739463236357), ('grey market', 0.012662397716895505), ('core market', 0.012656085780194134), ('grey', 0.01194181520974549)]
Topic 18:
[('pgp', 0.2106338648158133), ('key', 0.13747067284852693), ('pgp key', 0.11633357860089025), ('public', 0.03390884077311098), ('public pgp', 0.029752434238696095), ('message', 0.02634991063313772), ('lost', 0.02000184309357757), ('vendor pgp', 0.01994359114136673), ('decrypt', 0.01894570485870717), ('lost pgp', 0.018617781813871224)]
Topic 19:
[('deposit', 0.21545306050516058), ('deposited', 0.058649535773789216), ('ticket', 0.04179694434525314), ('address', 0.03870905816314492), ('double', 0.03846549616753392), ('double deposit', 0.03667999808480363), ('deposit address', 0.03156203629282908), ('btc', 0.03071204186414341), ('btc deposit', 0.029610323600547078), ('deposit issue', 0.026879010967503388)]
Topic 20:
[('bar', 0.11981212043710264), ('bunk', 0.04275809433364427), ('bars', 0.04134049952006931), ('selaminy', 0.04065510251937409), ('hulk', 0.03808720407648988), ('xmf', 0.032625152153394094), ('xanmasterfrank', 0.024794638230597937), ('thebartender', 0.02146829100887259), ('bunk bar', 0.021104670137899736), ('pack', 0.02109839573855679)]
Topic 21:
[('oxycodone', 0.0915503419563695), ('mg', 0.0710126695185631), ('oxy', 0.06895163961328402), ('opiate', 0.045863554052605914), ('opiateconnect', 0.04453309489080648), ('oxycodone mg', 0.032572918296594006), ('opioids', 0.02865321440258756), ('oxycontin', 0.027684562577623886), ('morphine', 0.026473417513483474), ('mg mg', 0.022411097043153062)]
Topic 22:
[('id', 0.1385509849376173), ('passport', 0.08568772643406097), ('fake', 0.08482410071909122), ('fake id', 0.07918133670896313), ('license', 0.0649065274454038), ('scan', 0.047816586940883546), ('driver', 0.03623643897437548), ('driving', 0.03147360609206778), ('driver license', 0.029977190765821365), ('dl', 0.02565984346716325)]
Topic 23:
[('drug', 0.15474318847411217), ('drugsuk', 0.03997464930394203), ('drugs', 0.035767234032706756), ('selling drug', 0.013706118772916276), ('drug dealer', 0.01363616498515454), ('pharma', 0.011535993702901273), ('anyone', 0.011523457595840088), ('online', 0.011292374800951103), ('drug checking', 0.01121409717784059), ('drug market', 0.010871004656672078)]
Topic 24:
[('coke', 0.24480516496483845), ('coke vendor', 0.07247842001502117), ('best coke', 0.04428189107711554), ('uk coke', 0.0308130119571335), ('uk', 0.027295920632150066), ('good coke', 0.023893359833443418), ('best', 0.02301379158858306), ('coke review', 0.020957440344420427), ('domestic coke', 0.02013314071553494), ('cola', 0.018480674314440285)]
Topic 25:
[('pill', 0.13687679139191483), ('xtc', 0.0689429271115277), ('xtc pill', 0.05785884094585473), ('ecstasy', 0.049586539528066305), ('pills', 0.04890054928026709), ('mg', 0.03486535976865647), ('xtc pills', 0.025764845512285212), ('pill press', 0.024698668416445056), ('press', 0.02251824475187729), ('pillchills', 0.021667648535163505)]
Topic 26:
[('counterfeit', 0.18938590987283574), ('note', 0.09246239331422364), ('euro', 0.08637948498220155), ('money', 0.04200643406185359), ('counterfeit money', 0.03946683058559953), ('counterfeit euro', 0.038261536147981785), ('fake', 0.03268594490209314), ('bill', 0.03208617920470757), ('counterfeit note', 0.03202031699172969), ('currency', 0.030601930905961253)]
Topic 27:
[('ketamine', 0.27076491836153055), ('ketamine vendor', 0.05465820126593129), ('ketamine review', 0.034236075793543305), ('mdma ketamine', 0.026958811216225554), ('review ketamine', 0.023265028523820383), ('racemic', 0.021840213597887684), ('ketamine ketamine', 0.0209361890500582), ('review', 0.02035840901808624), ('usaconnect', 0.019703435870339864), ('domestic ketamine', 0.019540443113387658)]
Topic 28:
[('wsm', 0.26062962876168194), ('wsm wsm', 0.0405533967559985), ('wsm vendor', 0.03762271698792374), ('vendor wsm', 0.03206339997224917), ('wsm order', 0.017737920836218876), ('dream wsm', 0.0163160597194279), ('wsm exit', 0.016188508790288752), ('vendor', 0.015836552381021), ('order wsm', 0.015192151060626139), ('exit', 0.014335338467236433)]
Topic 29:
[('meth', 0.22276036172115224), ('crystal meth', 0.05328948275951995), ('crystal', 0.04915146873849497), ('meth vendor', 0.04802431132793452), ('methamphetamine', 0.03725379840439993), ('best meth', 0.026685331810131725), ('speed', 0.01638305099194758), ('vendor', 0.015045674869553384), ('meth review', 0.01449593744356139), ('meth speed', 0.013462093823955613)]
Topic 30:
[('ticket', 0.23191906538627902), ('support ticket', 0.13004355933812262), ('support', 0.11743036604119221), ('please', 0.045630976919213126), ('ticket support', 0.03775927767239459), ('month', 0.03625628502501229), ('response', 0.029074496373647844), ('help', 0.028793658434030697), ('ticket please', 0.027419804082700512), ('ticket ticket', 0.024956763887226712)]
Topic 31:
[('hacked', 0.07268774210946359), ('hacker', 0.05958968811663258), ('hacking', 0.05746148496961), ('job', 0.042647311606897144), ('lfw', 0.035572613614792414), ('malware', 0.03096845695427884), ('hack', 0.025379278779315104), ('exploit', 0.02510108738610593), ('account', 0.024285044626452958), ('developer', 0.022590978647495333)]
Topic 32:
[('login', 0.09281872111514107), ('account', 0.07744354819167176), ('password', 0.06405732928260777), ('log', 0.05534837243822272), ('fa', 0.053826774653849), ('error', 0.03207318072187401), ('registration', 0.026153382171231136), ('ca nt', 0.024842545357693684), ('nt', 0.024626489071347155), ('username', 0.02415273192532678)]
Topic 33:
[('adderall', 0.1768984328457466), ('mg', 0.06327189728144145), ('ir', 0.04289049261783819), ('ritalin', 0.042675061609971), ('vyvanse', 0.04215927084997145), ('mg adderall', 0.04139982170401677), ('adderall mg', 0.040035809578354675), ('pharmacy', 0.03187900435840742), ('adderall vendor', 0.030260415441556283), ('brand name', 0.029564246899279608)]
Topic 34:
[('xmr', 0.24399148967723583), ('btc xmr', 0.06444934680865366), ('btc', 0.060770387170328795), ('xmrto', 0.04594811650627081), ('xmr btc', 0.03659618863655146), ('xmr deposit', 0.029697291143319954), ('monero', 0.024426574198576537), ('monero xmr', 0.02388623803691428), ('xmr withdrawal', 0.022905677166382882), ('lfw', 0.020975478723098375)]
Topic 35:
[('tails', 0.16608161332689814), ('tail', 0.13811881990583533), ('electrum', 0.12793371341884016), ('wallet', 0.060172586573322195), ('whonix', 0.043956636853453926), ('monero', 0.042820101505499926), ('usb', 0.038269012567511224), ('electrum wallet', 0.03266818304438283), ('electrum tail', 0.02751289116948836), ('monero wallet', 0.026256935911540154)]
Topic 36:
[('mushroom', 0.13494669849792765), ('mushrooms', 0.09224942455638298), ('shrooms', 0.08697247444291728), ('magic', 0.07173853017427768), ('cubensis', 0.059415494503169554), ('magic mushrooms', 0.04196185835332625), ('psilocybin', 0.040879833586613595), ('psilocybe', 0.03648147963849479), ('magic mushroom', 0.03622256754377185), ('penis', 0.03389227020845581)]
Topic 37:
[('dread', 0.2681730554127759), ('dread dread', 0.0434925406301928), ('cafe dread', 0.038289226826336564), ('cafe', 0.03771246613377383), ('dread word', 0.03726941736665281), ('word day', 0.037067233457539685), ('word', 0.03431554512407786), ('sub dread', 0.02638917753177937), ('sub', 0.021469881583509116), ('new dread', 0.01950150523536743)]
Topic 38:
[('cc', 0.19850827602610305), ('cvv', 0.12559480815359578), ('vbv', 0.05156445823606583), ('cc vendor', 0.0390492104991615), ('cc cvv', 0.03541724139457509), ('non', 0.03209423751593194), ('non vbv', 0.031689110721461924), ('ccv', 0.03142631440888132), ('cvv vendor', 0.030653897772347494), ('fullz', 0.02565185109174636)]
Topic 39:
[('cryptonia', 0.270593329016721), ('cryptonia market', 0.05372997670742348), ('cryptonia cryptonia', 0.04063200874811663), ('dcdutchconnectionuk', 0.02698588870900908), ('empire cryptonia', 0.025352048121288082), ('market', 0.021908509868778566), ('dutyfreesmoking', 0.021779896231356913), ('vendor cryptonia', 0.021426552039488206), ('nightmare', 0.019568159691726875), ('cryptonia new', 0.015018539633818486)]
Topic 40:
[('withdraw', 0.16777560333243718), ('withdrawal', 0.13214115961261508), ('withdrawl', 0.04517713695493818), ('withdraws', 0.030940211999519157), ('btc', 0.030198882248887494), ('withdraw btc', 0.029522871625856834), ('working', 0.029302858049487304), ('pin', 0.02777645879969811), ('issue', 0.027607420425603244), ('withdraw pin', 0.026471975299199244)]
Topic 41:
[('escrow', 0.22451268938457392), ('multisig', 0.08174648178119914), ('escrow escrow', 0.028342657225423777), ('full escrow', 0.022926390507942773), ('escrow order', 0.021256992919067835), ('extend', 0.020684030739848886), ('extend escrow', 0.020634859815608034), ('order escrow', 0.020634859815608034), ('escrow service', 0.019024811620750936), ('full', 0.016810907663589756)]
Topic 42:
[('heroin', 0.24667865086560822), ('heroin vendor', 0.043298418940449716), ('afghan', 0.0370261380542706), ('afghan heroin', 0.03321404939224221), ('synthetic heroin', 0.027180379676850733), ('best heroin', 0.024176709214531092), ('synthetic', 0.023744583992494864), ('heroin sale', 0.02343861004012714), ('heroin review', 0.023225789058935086), ('ww', 0.02051867713165788)]
Topic 43:
[('de', 0.04104874164468111), ('har', 0.03004281267703729), ('noen', 0.025615388883690592), ('som', 0.023644974354175934), ('fra', 0.02234525177783555), ('en', 0.021807143998736837), ('la', 0.019481414592274055), ('para', 0.018155202455363964), ('mi', 0.017936341798979228), ('som har', 0.017123720458272217)]
Topic 44:
[('dnm', 0.17395559883217143), ('dnms', 0.04555790479945865), ('dn', 0.04298869175969707), ('bible', 0.03480993898853253), ('dnstars', 0.026858365160836242), ('dnmuk', 0.024792337071541142), ('dm', 0.023808743501494478), ('avenger', 0.019833539648394766), ('dnm vendor', 0.0145781416058812), ('vendor bible', 0.014363474603523505)]
Topic 45:
[('wallstreet', 0.17075397467717646), ('wall', 0.12652076561807357), ('wall street', 0.07815865540024333), ('street', 0.0744307008536918), ('wall st', 0.058824930764469485), ('st', 0.05389895713137546), ('street market', 0.05050436093268345), ('wallstreet market', 0.047109434439433946), ('wallstreetmarket', 0.0411780639775008), ('market', 0.03414270461323302)]
Topic 46:
[('ddos', 0.3105265840768657), ('ddos attack', 0.11291435352309753), ('attack', 0.10930939685702705), ('ddos ddos', 0.028760472115733474), ('ddos attacks', 0.026872070458530385), ('attacks', 0.025462720293905332), ('attack ddos', 0.024955566700335146), ('market ddos', 0.02300839257477511), ('market', 0.020031741575660723), ('ddos market', 0.01694829797415259)]
Topic 47:
[('paypal', 0.2908803867616197), ('transfer', 0.11757620933383377), ('paypal transfer', 0.08045981107451718), ('paypal account', 0.05635507976369717), ('western union', 0.04434216793462146), ('western', 0.0436696030188777), ('union', 0.04260904354730236), ('account', 0.0397607705908628), ('transfer paypal', 0.03064364170976873), ('venmo', 0.030317733806543675)]
Topic 48:
[('heard', 0.07747679742838355), ('happened', 0.049591151053306165), ('anyone', 0.04910471727518297), ('anyone heard', 0.0484273484083465), ('thewizzardnl', 0.03331435228064015), ('anybody heard', 0.03167116165532856), ('has anyone', 0.03119052964382607), ('has', 0.029099336350054232), ('anybody', 0.028548771954404018), ('therealrc', 0.028190854068417655)]
Topic 49:
[('benzos', 0.1469965293325937), ('benzo', 0.14387882009188516), ('rc', 0.04859228899225342), ('benzo vendor', 0.039310366559657056), ('rc benzos', 0.038034750391445375), ('benzobananas', 0.028228816077837456), ('rc benzo', 0.02735900406635033), ('benzoboys', 0.02658197245182248), ('best benzos', 0.02540478949018245), ('vendor benzos', 0.01997789146191072)]
Topic 50:
[('fraud', 0.25672448642865076), ('fraudsters', 0.037980752363923176), ('fraud vendor', 0.034182677127530856), ('loan fraud', 0.02683887738250451), ('fraudfox', 0.0262534840019881), ('loan', 0.024909941198737545), ('fraud fraud', 0.021471115793857264), ('fraud forum', 0.01962254027592412), ('uk fraud', 0.017738943947790978), ('best fraud', 0.017442258023043663)]
Topic 51:
[('dream', 0.1256863407223506), ('dream vendor', 0.0729529170933915), ('dream market', 0.05965219005493259), ('vendor dream', 0.04823192566238449), ('vendor', 0.04446878236261653), ('market', 0.04074878970532253), ('vendor inquiry', 0.0273713869493084), ('inquiry', 0.02705502158368874), ('nightmare market', 0.02683006952691045), ('nightmare', 0.026809442585803697)]
Topic 52:
[('order', 0.12808537202217138), ('cancel', 0.1056355402458776), ('cancelled', 0.08816652155833675), ('refund', 0.06317930029458749), ('cancel order', 0.06064213753804928), ('canceled', 0.04733993827455691), ('cancelled order', 0.04708312251370501), ('order cancelled', 0.03478731701269228), ('refunded', 0.027871549314468867), ('auto', 0.025578461854749594)]
Topic 53:
[('bank', 0.25097424693158105), ('bank log', 0.06619702073400643), ('bank drop', 0.06323528667005968), ('log', 0.05747191271800869), ('bank account', 0.05180288067730549), ('drop', 0.05152932337508871), ('logs', 0.050620524412714064), ('bank logs', 0.04511971186214804), ('account', 0.041499352402375615), ('bank logins', 0.033855349762364984)]
Topic 54:
[('onion', 0.32006057669572174), ('onion site', 0.06649607625940172), ('site', 0.04428860772985556), ('onion link', 0.04208690156388304), ('onion list', 0.031916646862297816), ('link', 0.031239331798188157), ('list', 0.028761313034509493), ('onion address', 0.02732765760686286), ('onions', 0.02694578747755256), ('onion service', 0.025703448035517374)]
Topic 55:
[('phishing', 0.23561044910532333), ('phishing link', 0.06313802156628703), ('phished', 0.06309006404198617), ('link', 0.0544273135365213), ('warning', 0.046834055380374974), ('phishing site', 0.042547653995183336), ('phishing warning', 0.0359857895652629), ('site', 0.027930499527085186), ('warning phishing', 0.02550649584946054), ('attempt', 0.02386768528099481)]
Topic 56:
[('apollon', 0.2610557915169623), ('apollon market', 0.15266804701967007), ('market', 0.05451694610184281), ('apollon apollon', 0.04000921043636116), ('mysteryland', 0.03430376947112439), ('market apollon', 0.02572590386075179), ('vendor apollon', 0.022431011183581366), ('jerry', 0.02222860417547563), ('apollo', 0.02130530184604832), ('tom jerry', 0.0200948479637061)]
Topic 57:
[('opsec', 0.35022729224865795), ('opsec opsec', 0.046836666139475104), ('opsec question', 0.038854537729898934), ('bad opsec', 0.024478757307401466), ('question', 0.023721106194186832), ('good opsec', 0.023158241948069137), ('opsec guide', 0.022347138495029003), ('guide', 0.02053973286259763), ('dnm', 0.018032696761829944), ('help opsec', 0.01731744867947547)]
Topic 58:
[('link', 0.2907868689372496), ('working link', 0.1425857891330286), ('working', 0.12626414963429772), ('pm', 0.10998816783875198), ('link please', 0.07544814928265496), ('pm link', 0.06343413695872846), ('link working', 0.05576602338283193), ('please', 0.05553799256006957), ('pm working', 0.04196619637285951), ('please pm', 0.04165737257090193)]
Topic 59:
[('mirror', 0.37636469017813645), ('working mirror', 0.11472362758530133), ('working', 0.10713557979686175), ('mirror link', 0.07119835688788886), ('empire mirror', 0.06148918029810922), ('mirror working', 0.05358983328686946), ('mirror please', 0.05138990153126206), ('mirrors', 0.05043749015234773), ('link', 0.048633708091083104), ('pm', 0.045039742736056855)]
Topic 60:
[('fentanyl', 0.24680043197686974), ('fent', 0.10237006383161038), ('carfentanil', 0.03273064608297345), ('selling fentanyl', 0.028024619668411806), ('analogue', 0.026872645234550612), ('fentanyl vendor', 0.02315329870273171), ('fentanyl analogue', 0.022766057057637824), ('admits', 0.021163030156912815), ('fentanyl distribution', 0.02064331974556389), ('distribution', 0.019829358659107895)]
Topic 61:
[('cgmc', 0.28724192332245013), ('invite', 0.24126623833526956), ('invite code', 0.13131152093750256), ('code', 0.10682393111274423), ('cgmc invite', 0.08719794948943649), ('code cgmc', 0.028725764339705678), ('invite cgmc', 0.025151130113143), ('cgmc cgmc', 0.021480773490797567), ('registration', 0.017751806113272), ('cgmc open', 0.017695285128428256)]
Topic 62:
[('alprazolam', 0.1774950431253047), ('powder', 0.0903159297884282), ('alprazolam powder', 0.0683344750163608), ('flualprazolam', 0.04816050372458763), ('etizolam', 0.03744089718480313), ('flubromazolam', 0.03348945359221391), ('mg', 0.0314066846132662), ('diclazepam', 0.031003866363976132), ('clonazolam', 0.029651945399140792), ('etizolam powder', 0.02546982532718339)]
Topic 63:
[('dmt', 0.3622943812322028), ('dmt vendor', 0.08894662727116771), ('dmt vape', 0.04150662526223707), ('odsmt', 0.03831447187988808), ('dmt dmt', 0.03553878477512212), ('changa', 0.032613763332761496), ('bluefairy', 0.031201379299106255), ('looking dmt', 0.029361329758132906), ('dmt changa', 0.027337509067753882), ('shimshai', 0.02647236711639086)]
Topic 64:
[('captcha', 0.2564820546479972), ('rapture', 0.20670440179803615), ('rapture market', 0.058817891966963326), ('captcha captcha', 0.026389804388985475), ('incorrect', 0.023606308505803694), ('login', 0.023258878731221535), ('dread captcha', 0.022538685458512522), ('link', 0.021828131299706968), ('main', 0.021274078161211345), ('main link', 0.020481763901360785)]
Topic 65:
[('chemical', 0.12788227993451434), ('research', 0.10544064542899824), ('research chemical', 0.07562371216638425), ('chems', 0.06505695446372571), ('research chemicals', 0.04572949815260401), ('chemicals', 0.045233832083363984), ('chem', 0.03560638093549809), ('chemgenie', 0.03429712361445301), ('chemical vendor', 0.0317058058936168), ('chemist', 0.02535261421452179)]
Topic 66:
[('tor', 0.28796745713292526), ('tor browser', 0.07657915395648766), ('browser', 0.07489476479942064), ('tor network', 0.03000705217019978), ('network', 0.02939376833861387), ('tor tor', 0.026505883480420277), ('tor project', 0.022309921193224436), ('research', 0.022046155749248675), ('vpn', 0.018954071236719917), ('project', 0.018945142159026828)]
Topic 67:
[('mephedrone', 0.15930437837531358), ('meopcp', 0.1259770106160847), ('mxe', 0.09446545586467191), ('mescaline', 0.07779051848670795), ('mmc', 0.05270377202214209), ('mephedrone mmc', 0.049619701685294317), ('amt', 0.04403369509350668), ('meodmt', 0.03627605355094202), ('mephedrone vendor', 0.03200220270040846), ('meow', 0.03132904607312401)]
Topic -1:
[('anyone', 0.009157486850034612), ('get', 0.005884162902552367), ('update', 0.005625579875985257), ('review', 0.005597399142755926), ('new', 0.005416412975120537), ('order', 0.005209978694101062), ('account', 0.005047053236402527), ('address', 0.004937169330719495), ('uk', 0.004731322461951784), ('free', 0.004693989597580415)]
In [138]:
topic_model.visualize_topics()

image.png

In [139]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.48274117708206177
Davies_bouldin_score: 0.8687026513069406
In [140]:
topic_words = topic_model.get_topics()
topics_ll = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]

topn = 10
topic_list = []
for topic in topics_ll:
    topic_list.append([word for word, _ in topic[:topn]])

coherence_model = CoherenceModel(
    topics=topic_list,
    texts=[doc.split() for doc in tc1.corpus],
    dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
    coherence='c_v'
)

print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.5645685188607535
In [ ]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
In [189]:
zero_shot_topics = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()
dict_zero_shots_25 = ppt.assign_labels_to_topics(classifier, topic_model, zero_shot_topics, len(set(new_topics) - {-1}), threshold=.25)
Assigning labels to topics:   0%|          | 0/68 [00:00<?, ?it/s]
Assigning labels to topics: 100%|██████████| 68/68 [42:46<00:00, 37.74s/it] 
In [232]:
dict_zero_shots_25 = pd.read_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_150_20n/zero_shot_025.csv').set_index('Topic')['Labels'].to_dict()
In [ ]:
dict_zero_shots_25[1] = 'ask help - ask help post'
dict_zero_shots_25[2] = 'order'
dict_zero_shots_25[10] = 'crosspost vendor'
dict_zero_shots_25[13] = 'dream market - dread'
dict_zero_shots_25[14] = 'ask help - moderator'
dict_zero_shots_25[15] = 'cocaine vendor - cocaine'
dict_zero_shots_25[20] = 'bunk bar'
dict_zero_shots_25[28] = 'wsm vendor - wsm market'
dict_zero_shots_25[39] = 'cryptonia market - dread'
dict_zero_shots_25[41] = 'escrow service'
dict_zero_shots_25[48] = 'event happened'
dict_zero_shots_25[51] = 'dream - dream vendor - dread'
dict_zero_shots_25[52] = 'order cancelled'
dict_zero_shots_25[57] = 'opsec questions'
dict_zero_shots_25[58] = 'link'
dict_zero_shots_25[59] = 'mirror - mirror link - working mirror'
dict_zero_shots_25[64] = 'capcha'
In [ ]:
pd.DataFrame(list(dict_zero_shots_25.items()), columns=['Topic', 'Labels']).to_csv('ZeroShotClassificationResults/all-MiniLM-L6-v2_150_20n/zero_shot_025.csv', index=False)
In [235]:
topic_model.set_topic_labels(dict_zero_shots_25)
In [236]:
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, 
                          min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True, custom_labels=True)

image.png

In [ ]:
topic_model.visualize_barchart(top_n_topics=70, custom_labels=True, n_words=10)

image.png

In [ ]:
topic_model.visualize_hierarchy(custom_labels=True)

67HierachicalTimeSeries_0.68_150_20n.png

In [5]:
topic_model.visualize_heatmap(custom_labels=True)

image.png

In [239]:
topics_over_time = topic_model.topics_over_time(tc1.corpus, created_on, 
                                                global_tuning=True, evolution_tuning=True, nr_bins=100)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10, width=1250, height=700, custom_labels=True)
15it [00:22,  1.52s/it]

image.png

In [240]:
df['name_thread'] = df['name_thread'].str.lower().dropna()
df.drop_duplicates(subset='name_thread', inplace=True)
df.dropna(subset=['name_thread'], inplace=True)
created_on = df['created_on'].tolist()
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
corpus_valid = [tc1.corpus[i] for i in indices]
created_on_valid = [created_on[i] for i in indices]
embeddings_valid = [tc1.corpus_embeddings[i] for i in indices]
topics_valid = [new_topics[i] for i in indices]
probs_valid = [probs[i] for i in indices]

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
print(results_final.shape)
results_final.head()
(46629, 10)
Out[240]:
Document Embedding Topic Probability Created_on Count Name CustomName Representation Representative_Docs
0 naturalmeds apollon [-0.11246682, -0.03748099, 0.04490077, 0.01724... 56 [1.1964202652021368e-307, 6.589650224023648e-3... 2020-01-09 242 56_apollon_apollon market_market_apollon apollon market - apollon - apollon market [apollon, apollon market, market, apollon apol... [back apollon, apollon, apollon market]
1 redemption btc giveaway coke sub dread member ... [-0.043700494, -0.032600075, 0.0051953266, 0.0... 24 [0.006596099708564405, 0.003763170646085399, 0... 2020-01-09 544 24_coke_coke vendor_best coke_uk coke cocaine [coke, coke vendor, best coke, uk coke, uk, go... [fire coke, coke vendor , coke vendor]
2 flubromazolam sample giveaway [-0.04101017, 0.007629349, -0.07528322, -0.033... 62 [8.335712654832696e-308, 7.430095287264378e-30... 2019-11-06 290 62_alprazolam_powder_alprazolam powder_flualpr... powder - alprazolam - alprazolam powder [alprazolam, powder, alprazolam powder, flualp... [usa domestic alprazolam powder mxe apvp inbom...
3 cigarette tobacco replica [-0.07527819, 0.13146353, -0.07912154, -0.0353... 0 [0.040079176553298505, 0.007976173889767435, 0... 2020-01-09 5021 0_weed_cannabis_cart_thc marijuana [weed, cannabis, cart, thc, review, hash, shat... [product vendor review ml lemon kush wax vape ...
4 requiring image image review [-0.014864997, 0.08211257, 0.004136639, 0.0027... 16 [0.006753134223967767, 0.007309909359113744, 0... 2019-11-06 998 16_review_vendor review_review vendor_vendor reviews vendor [review, vendor review, review vendor, vendor,... [xpost danknation vendor review sunaero multis...
In [241]:
results_final.to_parquet('ResultsBERTopic/BERTopic_all-MiniLM-L6-v2_150_20n.parquet')
In [ ]:
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_150_20n", serialization="pickle", save_ctfidf=True, save_embedding_model=model)
In [243]:
topic_model.save("Models/topic_model_all-MiniLM-L6-v2_150_20n_safetensors", serialization="safetensors", save_ctfidf=True, save_embedding_model=model)
In [69]:
topic_model = BERTopic.load("Models/topic_model_all-MiniLM-L6-v2_150_20n")
In [244]:
sentence = ['recently closed Samsara market']
tp, pr = topic_model.transform(sentence)
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.00it/s]
2024-06-30 21:54:03,236 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-06-30 21:54:08,543 - BERTopic - Dimensionality - Completed ✓
2024-06-30 21:54:08,544 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-06-30 21:54:08,576 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2024-06-30 21:54:08,699 - BERTopic - Probabilities - Completed ✓
2024-06-30 21:54:08,701 - BERTopic - Cluster - Completed ✓
In [248]:
top_indices = np.argsort(pr[0])[::-1][:5]
top_topics = [(topic_model.get_topic(i), pr[0][i], topic_model.custom_labels_[i+1]) for i in top_indices]
df_finals = pd.DataFrame(top_topics, columns=['Topic', 'Probability', 'Label'])
df_finals['Words'] = df_finals['Topic'].apply(lambda topic: [word for word, prob in topic])
df_finals['Sentence'] = sentence * len(df_finals)
df_finals
Out[248]:
Topic Probability Label Words Sentence
0 [(cryptonia, 0.270593329016721), (cryptonia ma... 0.002991 cryptonia market - dread [cryptonia, cryptonia market, cryptonia crypto... recently closed Samsara market
1 [(empire, 0.1408905571475779), (empire market,... 0.002262 empire market [empire, empire market, empire empire, market,... recently closed Samsara market
2 [(wallstreet, 0.17075397467717646), (wall, 0.1... 0.002127 wallstreet [wallstreet, wall, wall street, street, wall s... recently closed Samsara market
3 [(scammer, 0.09029491748260987), (scam, 0.0879... 0.002075 fraud - scammer [scammer, scam, exit, scamming, scammed, exit ... recently closed Samsara market
4 [(crosspost, 0.14470709618860572), (giveaway, ... 0.002045 crosspost vendor [crosspost, giveaway, review crosspost, crossp... recently closed Samsara market
In [109]:
topic_model.get_topic(tp[0])
Out[109]:
[('anyone', 0.009415205712082564),
 ('update', 0.007916840604830654),
 ('address', 0.006939392479966835),
 ('new', 0.006268540212828576),
 ('get', 0.006179372827051399),
 ('vacation', 0.0059803996923821),
 ('has', 0.0058347636749867746),
 ('need', 0.005401699096715211),
 ('drop', 0.005394598005495695),
 ('listing', 0.005367223382048893)]

OTHER - No Preprocessing¶

In [110]:
df = pd.read_csv('../merged_data.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
df.shape[0]
Out[110]:
68931
In [111]:
df.head()
Out[111]:
name_board creator_thread creator_id_thread name_thread created_on registration_date total_posts reputation creator_post creator_id_post content created_on_post
0 Tor blonger 32544 Tor shrinked of about 13,5% but you can contri... 2020-01-09 1900-01-01 00:00:00 0.0 0.0 [deleted] 37 [removed] 2019-10-16
2 Tor Xanitforthecash 2467 should I run a tor node 2020-01-09 2018-03-21 00:00:00 88.0 9.0 rswz 32661 ***LINK***http://ea5faa5po25cf7fb.onion/projec... 2019-10-16
7 Tor Syndicate 33036 Tor Browser 9.0! 2019-11-06 1900-01-01 00:00:00 0.0 0.0 Syndicate 33036 Tor Browser 9.0 is now available from the Tor ... 2019-10-30
9 Tor Syndicate 33036 [UPDATED] Tor Security Guide Crosspost 2019-11-06 1900-01-01 00:00:00 0.0 0.0 Syndicate 33036 ***LINK***[UPDATED] Tor Security Guide[/post/9... 2019-10-30
10 Tor wekhiu48 1178 /u/CostcoRotisserieChicken spreading FUD, Let'... 2020-01-09 2019-05-24 00:00:00 1355.0 342.0 wekhiu48 1178 ***LINK***/u/CostcoRotisserieChicken spreading... 2019-10-30
In [112]:
model = SentenceTransformer('all-MiniLM-L6-v2')
tc1 = ppt.TextClustering(df, 'name_thread')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
2024-07-01 16:22:21,480 - PreProcessingText - INFO - Encoding the corpus. This might take a while.
Batches: 100%|██████████| 1065/1065 [10:23<00:00,  1.71it/s]
Out[112]:
array([[-2.8301043e-02,  1.2166312e-02,  8.7255865e-02, ...,
        -8.3833829e-02, -9.1334045e-02, -5.1197205e-02],
       [-4.2577364e-02,  2.9390754e-02,  4.8442027e-03, ...,
        -1.0984470e-01, -1.3457792e-02,  6.7995749e-02],
       [ 2.0266762e-02,  4.9410637e-02, -9.2650507e-06, ...,
        -6.9691852e-02,  5.9254151e-03, -2.0927912e-02],
       ...,
       [-5.2471079e-02, -7.9523809e-02, -2.1687793e-02, ...,
        -1.5866488e-02, -5.3970262e-02,  2.4720646e-02],
       [-3.0821422e-02, -3.0247206e-02, -1.0898691e-02, ...,
        -4.8142254e-02,  9.6911322e-03, -6.1955258e-02],
       [ 2.7433981e-03, -8.2745239e-02, -3.2834979e-03, ...,
        -4.6796635e-02,  6.8392135e-02, -3.5423629e-02]], dtype=float32)
In [150]:
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
pos_model = PartOfSpeech("en_core_web_sm")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
umap_model = UMAP(n_neighbors=20, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(
    top_n_words=10,
    n_gram_range=(1, 2),
    umap_model=umap_model, 
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model, 
    ctfidf_model=ctfidf_model, 
    representation_model={
        "KeyBERT": kw,
        "MMR": mmr,
        "POS": pos_model
    },
    embedding_model=model,
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
2024-07-01 17:56:59,017 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-01 17:59:01,118 - BERTopic - Dimensionality - Completed ✓
2024-07-01 17:59:01,139 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-01 17:59:52,141 - BERTopic - Cluster - Completed ✓
2024-07-01 17:59:52,287 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-07-01 18:00:27,018 - BERTopic - Representation - Completed ✓
In [151]:
topic_model.get_topic_info()
Out[151]:
Topic Count Name Representation KeyBERT MMR POS Representative_Docs
0 -1 24206 -1_etizolam_fullz_berlusconi_cdnven [etizolam, fullz, berlusconi, cdnven, pin, dro... [customs, ukbk, cdnven, heinekenexpress, produ... [etizolam, fullz, berlusconi, cdnven, pin, dro... [fullz, berlusconi, cdnven, drop, tochka, bond... [pill presses - chemicals - amphetamine salts ...
1 0 5970 0_weed_carts_thc_cannabis [weed, carts, thc, cannabis, distillate, shatt... [bulk weed, cannabis, uk weed, edibles, weed v... [weed, carts, thc, cannabis, distillate, shatt... [weed, carts, thc, cannabis, distillate, shatt... [1500 mg thc gummies - 60 pieces x 25mg each -...
2 1 2675 1_sub_noob_hello_subs [sub, noob, hello, subs, guys, thank, communit... [mentor, guides, newbie, happy new, guide, com... [sub, noob, hello, subs, guys, thank, communit... [sub, noob, subs, guys, community, help, notif... [i dont know what to do my life sucks now all ...
3 2 1919 2_empire_empire empire_empire market_market em... [empire, empire empire, empire market, market ... [empire market, empire markets, market empire,... [empire, empire empire, empire market, market ... [empire, mods, link, market, dispute, account,... [empire market is back!, zaguble is now on thi...
4 3 1780 3_dream_dream market_nightmare_dream dream [dream, dream market, nightmare, dream dream, ... [dream market, market dream, nightmare market,... [dream, dream market, nightmare, dream dream, ... [dream, nightmare, nightmare market, dreammark... [why is dream market still up?, dream market i...
... ... ... ... ... ... ... ... ...
69 68 173 68_chemicals_research chemicals_research_chems [chemicals, research chemicals, research, chem... [research chemicals, research chemical, resear... [chemicals, research chemicals, research, chem... [chemicals, research, chems, chemical, chemist... [research chemicals?, research chemicals, chem...
70 69 162 69_xanmasterfrank_xans_xan_xmf [xanmasterfrank, xans, xan, xmf, xansalad, xan... [mailman xans, xan vendor, new xan, xans, xan,... [xanmasterfrank, xans, xan, xmf, xansalad, xan... [xans, xansalad, game, demand, savior, rolling... [[notice] c10labs & mailman-xans vendor update...
71 70 159 70_2c_2cb_domestic 2c_2cb vendors [2c, 2cb, domestic 2c, 2cb vendors, cb, 2cb ve... [2c available, 2c vendors, 2c, looking 2c, 2c ... [2c, 2cb, domestic 2c, 2cb vendors, cb, 2cb ve... [2cb, 2cb vendor, good 2cb, domestic, ausorgan... [naghb!!! new batch nice and white, super dry!...
72 71 156 71_reviews_reviewer_reviews review_review theb... [reviews, reviewer, reviews review, review the... [reviews, reviews review, reviews trusted, rev... [reviews, reviewer, reviews review, review the... [reviews, reviewer, thebotanist, rcexpress, ho... [need good reviews by trusted people, no vouch...
73 72 150 72_samples_free samples_samples new_samples free [samples, free samples, samples new, samples f... [free samples, samples free, samples uk, free ... [samples, free samples, samples new, samples f... [samples, free samples, free, kwayuk, amphetam... [new vendor - free samples :), free samples ||...

74 rows × 8 columns

In [152]:
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
In [153]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6635172367095947
Davies_bouldin_score: 0.41709404604850947
In [154]:
topic_words = topic_model.get_topics()
topics_ll = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]

topn = 10
topic_list = []
for topic in topics_ll:
    topic_list.append([word for word, _ in topic[:topn]])

coherence_model = CoherenceModel(
    topics=topic_list,
    texts=[doc.split() for doc in tc1.corpus],
    dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
    coherence='c_v'
)

print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.4770898279015756
In [155]:
topic_model.visualize_topics()
In [162]:
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings", embeddings=tc1.corpus_embeddings, threshold=0.43)
topic_model.update_topics(tc1.corpus, topics=new_topics)
topic_model.get_topic_info()
2024-07-01 18:19:47,256 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
Out[162]:
Topic Count Name Representation KeyBERT MMR POS Representative_Docs
0 -1 16327 -1_to_the_is_for [to, the, is, for, anyone, with, how, and, in,... [customs, ukbk, cdnven, heinekenexpress, produ... [etizolam, fullz, berlusconi, cdnven, pin, dro... [fullz, berlusconi, cdnven, drop, tochka, bond... [pill presses - chemicals - amphetamine salts ...
1 0 5984 0_weed_carts_thc_cannabis [weed, carts, thc, cannabis, review, distillat... [bulk weed, cannabis, uk weed, edibles, weed v... [weed, carts, thc, cannabis, distillate, shatt... [weed, carts, thc, cannabis, distillate, shatt... [1500 mg thc gummies - 60 pieces x 25mg each -...
2 1 2680 1_this_help_you_the [this, help, you, the, what, to, need, we, me,... [mentor, guides, newbie, happy new, guide, com... [sub, noob, hello, subs, guys, thank, communit... [sub, noob, subs, guys, community, help, notif... [i dont know what to do my life sucks now all ...
3 2 1995 2_empire_on empire_empire market_on [empire, on empire, empire market, on, empire ... [empire market, empire markets, market empire,... [empire, empire empire, empire market, market ... [empire, mods, link, market, dispute, account,... [empire market is back!, zaguble is now on thi...
4 3 1851 3_dream_nightmare_on dream_dream market [dream, nightmare, on dream, dream market, mar... [dream market, market dream, nightmare market,... [dream, dream market, nightmare, dream dream, ... [dream, nightmare, nightmare market, dreammark... [why is dream market still up?, dream market i...
... ... ... ... ... ... ... ... ...
69 68 196 68_chemicals_research_research chemicals_chems [chemicals, research, research chemicals, chem... [research chemicals, research chemical, resear... [chemicals, research chemicals, research, chem... [chemicals, research, chems, chemical, chemist... [research chemicals?, research chemicals, chem...
70 69 274 69_xtc_xmf_xans_xan [xtc, xmf, xans, xan, xanmasterfrank, xanos, x... [mailman xans, xan vendor, new xan, xans, xan,... [xanmasterfrank, xans, xan, xmf, xansalad, xan... [xans, xansalad, game, demand, savior, rolling... [[notice] c10labs & mailman-xans vendor update...
71 70 201 70_2c_2cb_cb_2cb vendors [2c, 2cb, cb, 2cb vendors, for 2c, for 2cb, 2c... [2c available, 2c vendors, 2c, looking 2c, 2c ... [2c, 2cb, domestic 2c, 2cb vendors, cb, 2cb ve... [2cb, 2cb vendor, good 2cb, domestic, ausorgan... [naghb!!! new batch nice and white, super dry!...
72 71 453 71_review_reviews_review review_feedback [review, reviews, review review, feedback, kar... [reviews, reviews review, reviews trusted, rev... [reviews, reviewer, reviews review, review the... [reviews, reviewer, thebotanist, rcexpress, ho... [need good reviews by trusted people, no vouch...
73 72 238 72_samples_free samples_free_sample [samples, free samples, free, sample, samples ... [free samples, samples free, samples uk, free ... [samples, free samples, samples new, samples f... [samples, free samples, free, kwayuk, amphetam... [new vendor - free samples :), free samples ||...

74 rows × 8 columns

In [163]:
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
Topic 0:
[('weed', 0.026121802166754134), ('carts', 0.017488248895024136), ('thc', 0.016026211658409277), ('cannabis', 0.01577853469147459), ('review', 0.013686158893156207), ('distillate', 0.010819478752233087), ('hash', 0.010250132075366852), ('shatter', 0.010090807113538146), ('indoor', 0.009561748524550818), ('vape', 0.009464869665684333)]
Topic 1:
[('this', 0.022156612826095497), ('help', 0.01632414770533675), ('you', 0.014978321013280858), ('the', 0.012811440985770573), ('what', 0.012388997306343949), ('to', 0.012100264616063165), ('need', 0.01109007497215625), ('we', 0.0107600044873439), ('me', 0.009563915734872003), ('here', 0.009511330630291633)]
Topic 2:
[('empire', 0.10770406850965517), ('on empire', 0.04816961138770311), ('empire market', 0.0328421796934902), ('on', 0.022504719462360295), ('empire empire', 0.020009257170742853), ('market', 0.017631311042885157), ('empire is', 0.015589458155944999), ('is', 0.013414536942523092), ('to empire', 0.011324614016653523), ('is empire', 0.010859945457812397)]
Topic 3:
[('dream', 0.09641002602673354), ('nightmare', 0.04913856746609453), ('on dream', 0.047042687321489224), ('dream market', 0.03765107394642499), ('market', 0.028699181277440124), ('on', 0.025210226979058968), ('nightmare market', 0.018702413637631816), ('on nightmare', 0.01868877995580611), ('from dream', 0.018357467692169397), ('dream dream', 0.013163321597797439)]
Topic 4:
[('vendor', 0.052700029031768666), ('vendors', 0.04324652820637855), ('for', 0.01393399811834629), ('new vendor', 0.012932303425000725), ('looking', 0.012816596770230374), ('new', 0.012787313927470147), ('inquiry', 0.012680585032289849), ('vendor inquiry', 0.012597115343610051), ('looking for', 0.012564980334643528), ('for vendor', 0.011565740630903716)]
Topic 5:
[('scammer', 0.06557991898322732), ('scam', 0.062423182493869016), ('scamming', 0.0416026763582103), ('exit', 0.03962120344428672), ('scammed', 0.033278335068797134), ('legit', 0.026933766650158217), ('exit scam', 0.024854726513092378), ('is', 0.024516339837274866), ('is scammer', 0.022010863309222133), ('selective', 0.020247491591059354)]
Topic 6:
[('mdma', 0.12356226179434289), ('mda', 0.033837876024986675), ('mdma vendor', 0.021887930572066675), ('us', 0.01984273171377282), ('domestic', 0.015445618836374264), ('only', 0.014518200241130881), ('mdma review', 0.01350972889234908), ('usa', 0.013353540656566994), ('best mdma', 0.013039974920400056), ('for mdma', 0.01299316496196008)]
Topic 7:
[('darknet', 0.05747348912936089), ('dark', 0.05109485353135456), ('dark web', 0.03548081747093856), ('web', 0.0349281975945472), ('sentenced', 0.021720947221901443), ('the darknet', 0.017948854650965712), ('sentenced to', 0.017705595191481), ('drug', 0.017108539242676322), ('darkweb', 0.016965668888090583), ('the dark', 0.01635221966708456)]
Topic 8:
[('xanax', 0.13265126906538416), ('bars', 0.03233130571500381), ('xanax bars', 0.0275726855493186), ('2mg', 0.022540800070776374), ('xanax vendor', 0.018903558996081354), ('3mg', 0.0171280849541117), ('best xanax', 0.01274858148476675), ('xanaxdaddy', 0.012584567449035392), ('review', 0.012191854584679848), ('xanax and', 0.012161589874413053)]
Topic 9:
[('lsd', 0.1500541317544297), ('tabs', 0.03700257669240534), ('lsd vendor', 0.028352613109866642), ('gammagoblin', 0.01654793640468815), ('lsd tabs', 0.016497458693354932), ('free', 0.01600863216095035), ('usa', 0.015898146825191358), ('mdma', 0.014025313395207913), ('lsd and', 0.013043958505000241), ('lsd review', 0.012766233200526075)]
Topic 10:
[('carding', 0.1110495291929295), ('card', 0.06721751572964317), ('cards', 0.06601786278324696), ('credit', 0.029322000678320657), ('gift', 0.02302338008431706), ('debit', 0.019371118488606325), ('credit card', 0.01821021474864727), ('carder', 0.01626656387439844), ('carded', 0.01619506300655433), ('prepaid', 0.01577363769638637)]
Topic 11:
[('pgp', 0.13587212423768744), ('key', 0.0798850048816273), ('pgp key', 0.06858432902502162), ('2fa', 0.05618980658572741), ('public', 0.020958928563643633), ('keys', 0.019911531140275458), ('public pgp', 0.018319258469666012), ('encrypt', 0.018078668999193033), ('my', 0.015908248312017165), ('encryption', 0.015567742749305113)]
Topic 12:
[('market', 0.08551283440316028), ('markets', 0.045749059635866764), ('the market', 0.019094970124643295), ('this market', 0.017914991116499124), ('new market', 0.017627390353052007), ('market is', 0.017152331153567645), ('marketplace', 0.016173780267312046), ('the', 0.016093209921969998), ('what', 0.015302905089668325), ('is', 0.013757711975997023)]
Topic 13:
[('shipping', 0.06707952945625859), ('delivery', 0.03739198906886787), ('package', 0.0335807962526959), ('shipped', 0.021681308295394053), ('usps', 0.01928404468808869), ('to', 0.017620772243724696), ('international', 0.017275062354480578), ('delivered', 0.01699843444922411), ('shipping to', 0.016221752036348076), ('ship', 0.014560830277983484)]
Topic 14:
[('deposit', 0.1264517942094552), ('deposits', 0.03923223907665865), ('btc', 0.03729918660678976), ('deposited', 0.035671739831721616), ('address', 0.035130090808307056), ('ticket', 0.029276657832733487), ('double', 0.02676244272911281), ('double deposit', 0.024892547344915906), ('missing', 0.024171735143637723), ('not', 0.02167293853453858)]
Topic 15:
[('cocaine', 0.16558700571468687), ('fishscale', 0.025497383031868166), ('peruvian', 0.024666728874358633), ('cocaine vendor', 0.023603150801550694), ('fishscale cocaine', 0.021690075192145053), ('uncut', 0.021180531857615013), ('90', 0.019718351568327092), ('colombian', 0.01883387407701839), ('pure', 0.018772086249928766), ('ukwhite', 0.018187008764506282)]
Topic 16:
[('dispute', 0.18107255614430504), ('disputes', 0.03870186099223393), ('dispute dispute', 0.027198209942153568), ('please', 0.02385847984002258), ('mod', 0.020718806954577403), ('days', 0.019586736957386802), ('resolved', 0.019556720533625234), ('help', 0.01836971378875693), ('no', 0.018175031416175165), ('my dispute', 0.017926092916419396)]
Topic 17:
[('job', 0.053617966900222684), ('lfw', 0.049012324715556436), ('hacking', 0.032951530510461076), ('hacked', 0.03104745784243176), ('hacker', 0.030683441109691355), ('email', 0.02559051646261674), ('malware', 0.017334036909751067), ('btc', 0.016687285255972538), ('for', 0.015662969796092323), ('lfw btc', 0.014489022479059529)]
Topic 18:
[('monero', 0.0840300999786981), ('bitcoin', 0.054101446350999234), ('btc', 0.04376407780260462), ('wallet', 0.02102890172499106), ('buy', 0.020963394878476746), ('crypto', 0.02012981815428904), ('buying', 0.020066985939152047), ('to', 0.016582297731664773), ('to buy', 0.016291413951388542), ('coins', 0.015919374208244234)]
Topic 19:
[('id', 0.10997763982427408), ('fake', 0.07081625600743174), ('fake id', 0.06032720035661417), ('license', 0.04806559029900924), ('passport', 0.04803835906156708), ('passports', 0.03248988196610057), ('scans', 0.0316159900336969), ('drivers', 0.030886177518274296), ('ids', 0.029724785628560103), ('drivers license', 0.02478697220409289)]
Topic 20:
[('anyone', 0.07403115183970083), ('happened', 0.04158765883819754), ('has anyone', 0.040647945759817825), ('happened to', 0.04036933800330011), ('what happened', 0.03677703377857992), ('heard', 0.03664614180565851), ('tried', 0.03569834479856244), ('has', 0.03525422736021841), ('anyone tried', 0.03432681011844722), ('anyone heard', 0.030885389950925424)]
Topic 21:
[('oxycodone', 0.06628105916641315), ('oxy', 0.050914576261560554), ('opiateconnect', 0.029309391867125948), ('80mg', 0.025514415664952925), ('opiates', 0.024522116459384753), ('morphine', 0.023407474802139994), ('dilaudid', 0.02230935589510508), ('10mg', 0.022182979515936393), ('oxycontin', 0.020267064194940176), ('40mg', 0.018017473812191667)]
Topic 22:
[('meth', 0.1574163112434691), ('amphetamine', 0.049300409593825305), ('crystal meth', 0.038976415801772385), ('crystal', 0.036505916304162125), ('meth vendor', 0.02617692499953047), ('methamphetamine', 0.025396642831782397), ('best meth', 0.018245108818923387), ('meth vendors', 0.017370472966145134), ('speed', 0.015739908907543582), ('meth and', 0.014542599402273156)]
Topic 23:
[('counterfeit', 0.12871490570155777), ('notes', 0.06547435507955257), ('euro', 0.0491725502130328), ('money', 0.03141880581362891), ('fake', 0.029951479845183634), ('gbp', 0.029799843552008314), ('counterfeits', 0.029192110618025423), ('counterfeit money', 0.027394824167863107), ('bills', 0.026267525876960095), ('counterfeit euro', 0.0236297292596767)]
Topic 24:
[('dread', 0.15698287206715708), ('on dread', 0.0327391257020154), ('hugbunter', 0.03200970784564761), ('dread is', 0.029548523962026743), ('cafe dread', 0.022703209273806108), ('cafe', 0.022384956404216023), ('word of', 0.021845632483744725), ('dread word', 0.021620621848201585), ('the day', 0.02132354776547619), ('word', 0.01876864260879318)]
Topic 25:
[('ticket', 0.16201131673866934), ('support ticket', 0.09713196568034925), ('support', 0.09643044005038842), ('tickets', 0.041286863293046976), ('please', 0.034380750920094336), ('help', 0.024471719016428438), ('response', 0.01967290184393878), ('se7en', 0.019158710405234706), ('please help', 0.018414086886693087), ('support tickets', 0.017575439859763984)]
Topic 26:
[('onion', 0.11803562686306104), ('tor', 0.0984618899334574), ('browser', 0.025339074667152597), ('tor browser', 0.024049148739020237), ('onion site', 0.021392123557307526), ('onions', 0.018010212171229376), ('on tor', 0.01790931186994719), ('site', 0.017102240366678303), ('v3', 0.012676813959885938), ('the onion', 0.01246732698095282)]
Topic 27:
[('tails', 0.183531729136247), ('electrum', 0.08784236176016401), ('on tails', 0.0697525949099992), ('wallet', 0.03877641166224935), ('whonix', 0.028542381893042725), ('monero', 0.028057319113386218), ('usb', 0.024439255061112542), ('on', 0.022339449578295967), ('tails electrum', 0.020823069785503738), ('electrum on', 0.020823069785503738)]
Topic 28:
[('coke', 0.19926153013926018), ('coke vendor', 0.03682358431435203), ('best coke', 0.0356582596743699), ('coke vendors', 0.026343335892151952), ('coke from', 0.023791402489163786), ('best', 0.02177867133690634), ('uk coke', 0.02116613307728994), ('coke in', 0.019791918945312438), ('good coke', 0.019662398560559707), ('brazil', 0.01825329276693766)]
Topic 29:
[('alprazolam', 0.07291878421114571), ('diazepam', 0.05477739270191512), ('powder', 0.044584877773492924), ('valium', 0.03850174408224983), ('etizolam', 0.03747570252373598), ('clonazolam', 0.03283601295203318), ('alprazolam powder', 0.02609385034775606), ('10mg', 0.024261819484092423), ('diclazepam', 0.018839623367253085), ('2mg', 0.017417134216026774)]
Topic 30:
[('ketamine', 0.23138118469385505), ('ketamine vendor', 0.03281186428650618), ('ketamine review', 0.025667538351183156), ('isomer', 0.021074224401587748), ('review', 0.01943650911383348), ('domestic ketamine', 0.018773573535844048), ('ketamine vendors', 0.018773573535844048), ('racemic', 0.01767338064151054), ('usa', 0.017429407965742466), ('best ketamine', 0.016880092192053206)]
Topic 31:
[('wsm', 0.18893118468595335), ('on wsm', 0.07838443324529455), ('on', 0.024366535891183275), ('wsm wsm', 0.022356672533994926), ('to wsm', 0.020997391001001165), ('wsm vendor', 0.020361364408962745), ('from wsm', 0.015549309090392706), ('of wsm', 0.014579701648874278), ('wsm is', 0.013247998435407521), ('vendor', 0.01278812957643258)]
Topic 32:
[('refund', 0.075497103967865), ('order', 0.06830056073098584), ('cancel', 0.05861589822091079), ('cancelled', 0.04863445727152318), ('auto', 0.03220391958726608), ('an order', 0.03182625667963673), ('canceled', 0.026137884406081138), ('orders', 0.02526335256672877), ('to cancel', 0.022855288976379076), ('an', 0.0225526675615654)]
Topic 33:
[('mushrooms', 0.10787088587575279), ('shrooms', 0.0621613105343965), ('magic', 0.05571050873232468), ('mushroom', 0.054955686308678885), ('cubensis', 0.04685554721548522), ('magic mushrooms', 0.044651482918634515), ('psychedelics', 0.03945945002111887), ('psilocybin', 0.028563844512759645), ('psilocybe', 0.027421290732249262), ('psilocybe cubensis', 0.024488605716793014)]
Topic 34:
[('order', 0.10292926971740386), ('orders', 0.06400314292162491), ('ordering', 0.028216852218753558), ('ordered', 0.02593596483341188), ('from', 0.025664534449295392), ('order from', 0.025506010837155306), ('an order', 0.021006475473566995), ('ordered from', 0.019380795040906413), ('an', 0.014858006837862698), ('my order', 0.014823145437907932)]
Topic 35:
[('adderall', 0.11427842206017935), ('30mg', 0.03655466173023898), ('pharmacy', 0.03032135856783765), ('ir', 0.028534232288490995), ('ritalin', 0.0273691072254764), ('vyvanse', 0.022935988533964613), ('20mg', 0.021641923329480183), ('brand name', 0.018684967029720622), ('30mg adderall', 0.017426452023171744), ('adderallz', 0.01686291747710415)]
Topic 36:
[('ddos', 0.17414239015277766), ('captcha', 0.07577548089460238), ('ddos attacks', 0.04302444055596632), ('attacks', 0.042004900462510805), ('the ddos', 0.038617577187419594), ('ddos attack', 0.03717492195361204), ('attack', 0.03645706154154813), ('under', 0.02007774942483335), ('the', 0.019890898912752225), ('under ddos', 0.01988571139017261)]
Topic 37:
[('xmr', 0.21138804641985695), ('xmr to', 0.05750289424464451), ('btc', 0.04131037525187534), ('to xmr', 0.03764193405073356), ('morphscript', 0.034505106213172425), ('to', 0.02626411666278744), ('xmr deposit', 0.023019130278705533), ('btc xmr', 0.02071853544066472), ('btc to', 0.020386561961046594), ('xmr xmr', 0.02006179072475952)]
Topic 38:
[('account', 0.08157920319621015), ('login', 0.057614194350978365), ('password', 0.04449622725318416), ('locked', 0.03976770548133053), ('log', 0.03247005948649909), ('locked out', 0.03165830799933661), ('my account', 0.030429683045385705), ('log in', 0.02689883375014806), ('recovery', 0.024509352417925918), ('my', 0.022002421237649776)]
Topic 39:
[('de', 0.04779639905862267), ('på', 0.03372102638336295), ('har', 0.024320398161032687), ('en', 0.024016587519234082), ('noen', 0.02069338999531763), ('er', 0.020648371518196852), ('som', 0.019101590764908583), ('fra', 0.018095669323824955), ('zion', 0.01765646467230412), ('para', 0.016092210850804776)]
Topic 40:
[('heroin', 0.19281429808000025), ('afghan', 0.029139972777356787), ('afghan heroin', 0.02446449007816335), ('heroin vendor', 0.022852789333319346), ('synthetic heroin', 0.022109315479878364), ('crack', 0.019116982789578758), ('cocaine', 0.018328855127266994), ('opium', 0.018041525538213487), ('synthetic', 0.017907448638768184), ('best heroin', 0.017659106201776603)]
Topic 41:
[('paypal', 0.141445389455225), ('bank', 0.09380445473852679), ('drop', 0.06380071112088324), ('bank drop', 0.061129883786000656), ('drops', 0.05659308611795184), ('transfers', 0.049744815859334615), ('bank drops', 0.04874961855138504), ('transfer', 0.042918339848051365), ('paypal transfers', 0.030022841387120944), ('cashout', 0.027298016669983445)]
Topic 42:
[('cc', 0.16431657826411347), ('cvv', 0.09433825676780017), ('vbv', 0.04616551317698169), ('non vbv', 0.028762115432700422), ('cc cvv', 0.027131741330170718), ('non', 0.025808529202315265), ('ccv', 0.025703754944372263), ('ccs', 0.025439312020644134), ('fullz', 0.02007433918779596), ('uk cc', 0.0198102395980812)]
Topic 43:
[('withdraw', 0.12289322331297811), ('withdrawal', 0.0785577127887274), ('withdrawals', 0.03186187266623986), ('to withdraw', 0.030267943437358953), ('withdrawl', 0.027243329733033055), ('pin', 0.02719511037435446), ('withdraw pin', 0.021940158833065233), ('btc', 0.02177432580984283), ('withdraws', 0.02057668981262508), ('working', 0.020044844625358132)]
Topic 44:
[('pills', 0.11126813852504376), ('xtc', 0.06292529793177554), ('xtc pills', 0.0571375463172878), ('pill', 0.05346705854762878), ('ecstasy', 0.0472681743528566), ('pill press', 0.023104528145599042), ('pillchills', 0.021957955442254703), ('ecstasy pills', 0.021700716243544197), ('press', 0.01981535592625848), ('xtc pill', 0.017885222360582386)]
Topic 45:
[('bars', 0.1613690467827236), ('bar', 0.0370835629564199), ('bunk', 0.03659380570861019), ('bunk bars', 0.019897671932218714), ('thebartender', 0.018810298991064574), ('alp', 0.017990425471111064), ('oc bars', 0.017794316603773463), ('oc', 0.017505628508061418), ('bars from', 0.017055147370473183), ('budgetbars', 0.015219071981035545)]
Topic 46:
[('cryptonia', 0.21123299187609967), ('on cryptonia', 0.09273819671628139), ('cryptonia market', 0.036141770391833056), ('on', 0.028423540857280643), ('cryptonia cryptonia', 0.027613651302351248), ('to cryptonia', 0.02293353518301816), ('now', 0.019288496099719173), ('at cryptonia', 0.01582069248026594), ('market', 0.015424512621159242), ('cryptonia is', 0.01392815556061474)]
Topic 47:
[('sale', 0.0669009177130793), ('promo', 0.030808587495418135), ('deals', 0.02013154966324256), ('products', 0.01809389540808409), ('price', 0.017055175637728157), ('20', 0.015416471561826933), ('sales', 0.014123334379440562), ('all', 0.013893851454684978), ('on all', 0.013871032069391573), ('prices', 0.013592431044111595)]
Topic 48:
[('crosspost', 0.16270972230216432), ('review crosspost', 0.025019188870844936), ('crosspost vendor', 0.01838207597271105), ('review', 0.016720105714953134), ('giveaway', 0.013660190788554766), ('crosspost review', 0.011645548578536474), ('2019 crosspost', 0.009958124639188928), ('vendor', 0.009949489051313667), ('giveaway crosspost', 0.009320042620928823), ('for', 0.0077798102009458215)]
Topic 49:
[('pack', 0.1364860666744876), ('packs', 0.11044224137068849), ('oc', 0.0421449573216045), ('landed', 0.039272308885937565), ('pack from', 0.033269023573027565), ('pack landed', 0.028504085141223018), ('packs from', 0.027278093710935654), ('landing', 0.0250916486947475), ('land', 0.024262921839508213), ('from', 0.02161306341104762)]
Topic 50:
[('benzo', 0.11057106932609045), ('benzos', 0.09263476300592406), ('rc', 0.0333035930349936), ('benzobananas', 0.029710130154272065), ('rc benzos', 0.025421318436476232), ('benzos4u', 0.02022653587516694), ('rc benzo', 0.01946620240831312), ('benzo vendor', 0.01912087633670443), ('benzoboys', 0.018962377382969003), ('liquid', 0.01797363735790245)]
Topic 51:
[('links', 0.14985119813634712), ('link', 0.14460198096056903), ('working', 0.11897238135796465), ('pm', 0.08294773649970093), ('working links', 0.07547889923403933), ('working link', 0.06837483843911635), ('pm me', 0.058270123126603415), ('me', 0.04673518044631064), ('please', 0.04145364644774408), ('link please', 0.03342161836140401)]
Topic 52:
[('vendor review', 0.13176904076454998), ('review', 0.10508725381281081), ('vendor', 0.07385331458017709), ('review vendor', 0.04329079272179012), ('reviews', 0.02661195949299305), ('feedback', 0.022680277962396123), ('vendor reviews', 0.01907132995225127), ('review for', 0.014085077763699176), ('review template', 0.012897663376432437), ('template', 0.011980985758793836)]
Topic 53:
[('dnm', 0.12825676892525106), ('dnms', 0.030088111144422336), ('dnstars', 0.024418674148458663), ('dn', 0.024162654979166785), ('the', 0.019843338747650742), ('the dnm', 0.0196448944940482), ('dnmuk', 0.01612662462301172), ('dm', 0.014714059947660639), ('of dnm', 0.013986898892251154), ('on dnm', 0.013003720302421853)]
Topic 54:
[('drugs', 0.1340007380450139), ('drug', 0.07777143413865256), ('drugsuk', 0.021498056085625307), ('drugs in', 0.020534253918424974), ('drugs are', 0.014828844915434722), ('to', 0.013938225834708464), ('the', 0.013475625258109669), ('how', 0.013044053233839281), ('selling drugs', 0.012941161757198665), ('of drugs', 0.012822992341620756)]
Topic 55:
[('speed', 0.183950814430372), ('speed paste', 0.06463035435808222), ('paste', 0.06440461831035135), ('speedbuster', 0.05917315935237283), ('mph', 0.05275738646237156), ('4f', 0.05275738646237156), ('4f mph', 0.05275738646237156), ('vendor speedbuster', 0.027511156012562783), ('vendorshop', 0.025218874791376196), ('speed vendor', 0.02439980096449533)]
Topic 56:
[('dmt', 0.2904789559151491), ('dmt vendor', 0.053730638923248135), ('aco dmt', 0.05260313518578659), ('aco', 0.051665177663252904), ('changa', 0.03326465228271408), ('for dmt', 0.03291369008538988), ('meo dmt', 0.030423677339823103), ('dmt vendors', 0.02913423306345144), ('dmt changa', 0.025253740985173603), ('meo', 0.02496057714410078)]
Topic 57:
[('escrow', 0.20917833515749504), ('in escrow', 0.03831598055804673), ('extend', 0.02307698150961151), ('extend escrow', 0.022350652521144168), ('escrow escrow', 0.01937368965193982), ('on escrow', 0.019133252797808968), ('escrow is', 0.01785189107154408), ('to extend', 0.015178660518416464), ('funds', 0.014105003627973312), ('with escrow', 0.014089956110501688)]
Topic 58:
[('mirror', 0.1999115213913919), ('mirrors', 0.17114555752439659), ('working', 0.09230346542206823), ('working mirror', 0.06698074861901485), ('working mirrors', 0.05348738062554266), ('mirror links', 0.041331468169803175), ('pm', 0.03250728698084981), ('mirror please', 0.02993888348824195), ('empire', 0.029327173164164985), ('pm me', 0.027800481444538137)]
Topic 59:
[('fraud', 0.20527232427652484), ('fraudsters', 0.029587415280798727), ('fraudfox', 0.02306152535746591), ('loan fraud', 0.02208411057803487), ('loan', 0.020765310304769444), ('for fraud', 0.017402408525299756), ('of fraud', 0.015875099058975835), ('best fraud', 0.01432044731610317), ('fraud and', 0.013749148978319373), ('forums', 0.013241012810604615)]
Topic 60:
[('bank', 0.16994005584365096), ('logs', 0.09399747297956386), ('bank logs', 0.07828367201948147), ('bank account', 0.046571339197672396), ('account', 0.03631266913625938), ('accounts', 0.032668200950556496), ('logins', 0.03198527642482958), ('bank logins', 0.028829321452800767), ('fullz', 0.02663355105468197), ('banks', 0.025240976495874503)]
Topic 61:
[('opsec', 0.2736853559343139), ('opsec for', 0.02909894491515325), ('opsec opsec', 0.02828295978078474), ('opsec and', 0.025348928187423684), ('opsec question', 0.02329669639014293), ('opsec guide', 0.022093620273672798), ('good opsec', 0.019959592629263525), ('bad opsec', 0.018855306520523158), ('my opsec', 0.01718392687952329), ('dnm', 0.016706764016850874)]
Topic 62:
[('fentanyl', 0.1955077892521244), ('fent', 0.08198709355740003), ('carfentanil', 0.025983205225545265), ('fentanyl distribution', 0.018954918299687127), ('of fentanyl', 0.018954918299687127), ('selling fentanyl', 0.018954918299687127), ('for fentanyl', 0.018954918299687127), ('distribution', 0.01807846715551803), ('analogues', 0.01713333362248868), ('in', 0.015865359129560205)]
Topic 63:
[('apollon', 0.18726518782162851), ('apollon market', 0.10163168288299376), ('on apollon', 0.0943424516858095), ('market', 0.04339820103949332), ('mysteryland', 0.03511425301193948), ('apollon apollon', 0.03036889048705953), ('on', 0.02815918727792239), ('to apollon', 0.022217092836984975), ('now', 0.022060915825673807), ('now on', 0.01946998401413869)]
Topic 64:
[('cgmc', 0.19823170606174625), ('invite', 0.18193935736305442), ('invite code', 0.08872314369090809), ('code', 0.0741401474069647), ('cgmc invite', 0.062124405870338295), ('on cgmc', 0.039104535817030044), ('an invite', 0.036559756378421876), ('invite codes', 0.03451579499235061), ('invites', 0.0334036319170144), ('codes', 0.03281925512609211)]
Topic 65:
[('aus', 0.0873531594985505), ('australia', 0.06952251222503282), ('aussie', 0.050977518898740276), ('australian', 0.03343382333915655), ('auspost', 0.029562213419453396), ('to australia', 0.02920361228056182), ('to aus', 0.02556850443461526), ('aussies', 0.025002204495531297), ('aussiehits', 0.0223786664446106), ('auspride', 0.02153665064886614)]
Topic 66:
[('phishing', 0.17699950168725506), ('phished', 0.06892906327420083), ('phishing links', 0.052061774436907814), ('links', 0.045282671799454555), ('warning', 0.028705934769355113), ('beware', 0.022619811704386532), ('got phished', 0.022599402854112032), ('link', 0.02249999517006053), ('phishing site', 0.02033946256870083), ('phishing link', 0.020151841887474525)]
Topic 67:
[('wallstreet', 0.18032562887151804), ('on wallstreet', 0.08164141622247753), ('wall', 0.04921975962741589), ('wallstreet market', 0.04187595188848951), ('wall st', 0.04158761120166923), ('st', 0.04115671976566791), ('wallstreetmarket', 0.039058613883569425), ('wallst', 0.03373377136206345), ('on', 0.02829346360257876), ('market', 0.020489164137491248)]
Topic 68:
[('chemicals', 0.10016938894460503), ('research', 0.09281847348498225), ('research chemicals', 0.07118366551106406), ('chems', 0.056909133858895296), ('chemical', 0.05475666577774158), ('research chemical', 0.034301108450939845), ('chem', 0.03224773598075937), ('chemist', 0.024020545800946164), ('com', 0.020917020525176258), ('chemtheory', 0.019376125674780503)]
Topic 69:
[('xtc', 0.06975510013050482), ('xmf', 0.06371143075822816), ('xans', 0.05423739326291133), ('xan', 0.052651185021238125), ('xanmasterfrank', 0.0506170121851223), ('xanos', 0.02402097080505311), ('xansalad', 0.020681665033123523), ('xtc vendor', 0.020240668489923782), ('xan vendor', 0.018096456903983083), ('free xtc', 0.01609315550468023)]
Topic 70:
[('2c', 0.21040678226208337), ('2cb', 0.14155506019153832), ('cb', 0.026888453075878907), ('2cb vendors', 0.024518631692465168), ('for 2c', 0.024518631692465168), ('for 2cb', 0.022197936333833275), ('2c 2c', 0.022197936333833275), ('domestic', 0.021947515946964532), ('us', 0.021687589917266496), ('domestic 2c', 0.021359286959244696)]
Topic 71:
[('review', 0.1293235354899173), ('reviews', 0.08095409212161189), ('review review', 0.04953955668613089), ('feedback', 0.021504094900986072), ('karmaking', 0.013294560556512758), ('reviews on', 0.011950872473099866), ('post', 0.011790311779232664), ('reviewer', 0.011706915341104806), ('sample review', 0.010996486731792397), ('order review', 0.010931383517996723)]
Topic 72:
[('samples', 0.14866103068403283), ('free samples', 0.09900928835552782), ('free', 0.08668026193176706), ('sample', 0.05214012554204901), ('samples on', 0.028360437056176527), ('samples free', 0.02512163566406409), ('free sample', 0.022945404374660198), ('samples left', 0.016630035400317673), ('our products', 0.016630035400317673), ('on all', 0.016404806972083175)]
Topic -1:
[('to', 0.009994879918409097), ('the', 0.008779454354158061), ('is', 0.008650560310022778), ('for', 0.008340702261717663), ('anyone', 0.007211720090693047), ('with', 0.006872813640534324), ('how', 0.006854467093263099), ('and', 0.006784165355486146), ('in', 0.006723924586602931), ('on', 0.006544858676467028)]
In [164]:
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)
indices = [index for index, topic in enumerate(new_topics) if topic != -1]
X=umap_embeddings[np.array(indices)]
labels = [topic for index, topic in enumerate(new_topics) if topic != -1]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.451997309923172
Davies_bouldin_score: 1.137154174755223
In [165]:
topic_words = topic_model.get_topics()
topics_ll = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]

topn = 10
topic_list = []
for topic in topics_ll:
    topic_list.append([word for word, _ in topic[:topn]])

coherence_model = CoherenceModel(
    topics=topic_list,
    texts=[doc.split() for doc in tc1.corpus],
    dictionary=corpora.Dictionary([doc.split() for doc in tc1.corpus]),
    coherence='c_v'
)

print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.5139428428742826
In [166]:
topic_model.visualize_barchart(top_n_topics=70, custom_labels=True, n_words=10)
In [161]:
sentence = ['sell xanax, coke, weed, gun and password']
tp, pr = topic_model.transform(sentence)
top_indices = np.argsort(pr[0])[::-1][:5]
top_topics = [(topic_model.get_topic(i), pr[0][i], topic_model.generate_topic_labels()[i+1]) for i in top_indices]
df_finals = pd.DataFrame(top_topics, columns=['Topic', 'Probability', 'Label'])
df_finals['Words'] = df_finals['Topic'].apply(lambda topic: [word for word, prob in topic])
df_finals['Sentence'] = sentence * len(df_finals)
df_finals
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
2024-07-01 18:07:22,315 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-07-01 18:07:32,083 - BERTopic - Dimensionality - Completed ✓
2024-07-01 18:07:32,083 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-07-01 18:07:32,099 - BERTopic - Probabilities - Start calculation of probabilities with HDBSCAN
2024-07-01 18:07:32,212 - BERTopic - Probabilities - Completed ✓
2024-07-01 18:07:32,212 - BERTopic - Cluster - Completed ✓
Out[161]:
Topic Probability Label Words Sentence
0 [(alprazolam, 0.06763628572821677), (diazepam,... 0.013500 29_alprazolam_diazepam_powder [alprazolam, diazepam, powder, valium, etizola... sell xanax, coke, weed, gun and password
1 [(xanax, 0.13073115730988635), (bars, 0.032445... 0.013155 8_xanax_bars_xanax bars [xanax, bars, xanax bars, 2mg, xanax vendor, 3... sell xanax, coke, weed, gun and password
2 [(oxycodone, 0.061735118720444515), (oxy, 0.04... 0.008704 21_oxycodone_oxy_opiateconnect [oxycodone, oxy, opiateconnect, 80mg, opiates,... sell xanax, coke, weed, gun and password
3 [(benzo, 0.10894708695513186), (benzos, 0.0934... 0.007228 50_benzo_benzos_rc [benzo, benzos, rc, rc benzos, benzobananas, b... sell xanax, coke, weed, gun and password
4 [(pills, 0.11040973400268793), (xtc, 0.0623399... 0.007184 44_pills_xtc_xtc pills [pills, xtc, xtc pills, pill, ecstasy, pill pr... sell xanax, coke, weed, gun and password
In [ ]:
 

KeyBERT¶

In [169]:
df = pd.read_csv('cleaned_data_name_thread.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
docs = df['name_thread'].tolist()
In [85]:
kw_model = KeyBERT()
df = pd.read_csv('cleaned_data_name_thread.csv')
df = df.dropna(subset=['name_thread'])
df = df.drop_duplicates(subset=['name_thread'], keep='first')
docs = df['name_thread'].tolist()
keywords = kw_model.extract_keywords(docs)
vocabulary = [k[0] for keyword in keywords for k in keyword]
vocabulary = list(set(vocabulary))
In [92]:
model = SentenceTransformer('all-MiniLM-L6-v2')
embedding = model.encode(vocabulary, batch_size=32, show_progress_bar=True)
Batches: 100%|██████████| 726/726 [01:28<00:00,  8.24it/s]
In [167]:
vectorizer_model= CountVectorizer(analyzer = 'word', ngram_range = (1, 5), vocabulary = vocabulary, lowercase=False)
topic_model_kw = BERTopic(vectorizer_model=vectorizer_model, nr_topics='auto', min_topic_size=120)
In [ ]:
topics_kw, probs_kw = topic_model_kw.fit_transform(df.name_thread)
In [105]:
topic_model_kw.visualize_barchart(top_n_topics=200, custom_labels=True, n_words=10)
In [106]:
topic_model_kw.get_topic_info()
Out[106]:
Topic Count Name Representation Representative_Docs
0 -1 23811 -1_order_vendor_account_review [order, vendor, account, review, mg, new, look... [best market bulk bar alp powder, uk best pric...
1 0 11480 0_weed_mg_xanax_cart [weed, mg, xanax, cart, lsd, cocaine, pill, ca... [product review Dimmey Strawberry Banana Sherb...
2 1 6261 1_scammer_scam_vendor_scamming [scammer, scam, vendor, scamming, market, drea... [scammer, SOLUTION TO market EXIT scamming, Pa...
3 2 2273 2_guy_help_sub_post [guy, help, sub, post, message, day, advice, u... [Sup guy new, please help advice, sub Please R...
4 3 1760 3_empire_working_deposit_market [empire, working, deposit, market, link, ticke... [top Notch DM Vendor Mahatma empire, JerryGarc...
... ... ... ... ... ...
60 59 124 59_vending_libertas_market_network [vending, libertas, market, network, start, mo... [vending Nightmare Market, vending Labels, ven...
61 60 123 60_subdread_subdreads_create_welcome [subdread, subdreads, create, welcome, require... [New subdread Developer Project Partnerships, ...
62 61 123 61_seized_seizure_package_raided [seized, seizure, package, raided, letter, not... [Need Help Damage Control seized, Mail seized,...
63 62 122 62_tochka_vps_nocturno_transaction [tochka, vps, nocturno, transaction, installat... [VENDOR OF LEAN tochka, Waxthtazz tochka, tochka]
64 63 121 63_berlusconi_market_lovedoctor_good [berlusconi, market, lovedoctor, good, scammer... [market good berlusconi, cant see berlusconi m...

65 rows × 5 columns

In [108]:
topic_words = topic_model_kw.get_topics()
topics_ll = [topic_words[i] for i in range(len(topic_words) - 1) if i != -1]

topn = 10
topic_list = []
for topic in topics_ll:
    topic_list.append([word for word, _ in topic[:topn]])

coherence_model = CoherenceModel(
    topics=topic_list,
    texts=[doc.split() for doc in df.name_thread],
    dictionary=corpora.Dictionary([doc.split() for doc in df.name_thread]),
    coherence='c_v'
)

print(f"Coherence Model: {coherence_model.get_coherence()}")
Coherence Model: 0.36753855014096726
In [109]:
topic_model_kw.visualize_topics()
In [ ]: